From a86b4c58e8f72f4903d873d25510f53f7577366f Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Tue, 14 Oct 2025 15:53:10 -0700
Subject: [PATCH 01/51] remove attn output view kernel (#26680)

Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/attention/layer.py                               | 6 +++---
 vllm/v1/attention/backends/flash_attn.py              | 2 +-
 vllm/v1/attention/backends/flashinfer.py              | 2 +-
 vllm/v1/attention/backends/flex_attention.py          | 2 +-
 vllm/v1/attention/backends/rocm_aiter_fa.py           | 2 +-
 vllm/v1/attention/backends/rocm_aiter_unified_attn.py | 2 +-
 vllm/v1/attention/backends/rocm_attn.py               | 2 +-
 vllm/v1/attention/backends/tree_attn.py               | 2 +-
 vllm/v1/attention/backends/triton_attn.py             | 2 +-
 vllm/v1/attention/backends/xformers.py                | 2 +-
 10 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 929c3b6a4906b..fe9de65b52c66 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -346,7 +346,7 @@ class Attention(nn.Module, AttentionLayerBase):
 
         if self.use_output:
             output_shape = output_shape if output_shape is not None else query.shape
-            output = torch.zeros(output_shape, dtype=output_dtype, device=query.device)
+            output = torch.empty(output_shape, dtype=output_dtype, device=query.device)
             hidden_size = output_shape[-1]
             # Reshape the query, key, and value tensors.
             # NOTE(woosuk): We do this outside the custom op to minimize the
@@ -705,7 +705,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
                 self.calc_kv_scales(q, kv_c_normed, k_pe)
 
             if self.attn_backend.accept_output_buffer:
-                output = torch.zeros(output_shape, dtype=q.dtype, device=q.device)
+                output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
                 self.impl.forward(
                     self,
                     q,
@@ -722,7 +722,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
                 )
         else:
             if self.attn_backend.accept_output_buffer:
-                output = torch.zeros(output_shape, dtype=q.dtype, device=q.device)
+                output = torch.empty(output_shape, dtype=q.dtype, device=q.device)
                 torch.ops.vllm.unified_mla_attention_with_output(
                     q,
                     kv_c_normed,
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index fa4e34536135d..9e0c125d9edb7 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -530,7 +530,7 @@ class FlashAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         attn_type = self.attn_type
 
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 0fa71afa62eef..ee32f7e2904f7 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -857,7 +857,7 @@ class FlashInferImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         if self.bmm1_scale is None:
             self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 2595851e5042d..902872bb25b33 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -767,7 +767,7 @@ class FlexAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
             # query = self.view_as_4d(query).permute(0, 2, 1, 3)
             # return torch.empty_like(query)
 
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index cce43b220da77..7c73611d4a58a 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -485,7 +485,7 @@ class AiterFlashAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         # IMPORTANT!
         # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index 14184944934fa..27b072106268b 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -130,7 +130,7 @@ class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         assert attn_metadata.use_cascade is False
 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 5245c7f449259..8b7ce90a3ccae 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -299,7 +299,7 @@ class RocmAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         assert attn_metadata.use_cascade is False
 
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index aab90cfd1fe0d..ee6ead9ad9b35 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -379,7 +379,7 @@ class TreeAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         # Cache the input KVs.
         key_cache, value_cache = kv_cache.unbind(0)
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 9d1d007a08e4c..9746a0eb58bd2 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -298,7 +298,7 @@ class TritonAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         assert attn_metadata.use_cascade is False
 
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
index 41c543c18adcc..457b15ebdd82f 100644
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -354,7 +354,7 @@ class XFormersAttentionImpl(AttentionImpl):
 
         if attn_metadata is None:
             # Profiling run.
-            return output
+            return output.fill_(0)
 
         # Cache the input KVs.
         key_cache, value_cache = kv_cache.unbind(0)

From 4aed506b6538ec4f284c480bf4449e9dc5f72054 Mon Sep 17 00:00:00 2001
From: Nick Hill <nhill@redhat.com>
Date: Tue, 14 Oct 2025 16:27:44 -0700
Subject: [PATCH 02/51] [Core] Streamline some structured output related code
 (#26737)

Signed-off-by: Nick Hill <nhill@redhat.com>
---
 tests/v1/core/test_scheduler.py               | 18 +++--
 .../unit/test_kv_connector_lifecyle.py        |  2 +-
 tests/v1/tpu/worker/test_tpu_model_runner.py  | 24 +++----
 tests/v1/worker/test_gpu_model_runner.py      | 24 +++----
 vllm/v1/core/sched/output.py                  |  5 +-
 vllm/v1/core/sched/scheduler.py               | 65 +++++++++----------
 vllm/v1/request.py                            | 18 +++--
 vllm/v1/structured_output/__init__.py         | 36 +++++-----
 vllm/v1/structured_output/backend_guidance.py |  2 +-
 vllm/v1/structured_output/request.py          | 44 +++++++------
 vllm/v1/structured_output/utils.py            |  9 +--
 vllm/v1/worker/gpu_model_runner.py            |  6 +-
 vllm/v1/worker/tpu_model_runner.py            |  6 +-
 13 files changed, 121 insertions(+), 138 deletions(-)

diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 76408fba2e169..aaac2deb12ac2 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -30,7 +30,6 @@ from vllm.v1.kv_cache_interface import (
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
-from vllm.v1.structured_output.request import StructuredOutputRequest
 
 from .utils import EOS_TOKEN_ID, create_requests, create_scheduler
 
@@ -335,10 +334,10 @@ def test_stop_via_update_from_output():
             requests[0].request_id: [],
             requests[1].request_id: [10],
         },
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -383,10 +382,10 @@ def test_stop_via_update_from_output():
             requests[0].request_id: [10, 42],
             requests[1].request_id: [13],
         },
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -429,10 +428,10 @@ def test_stop_via_update_from_output():
             requests[0].request_id: [10, 11],
             requests[1].request_id: [],
         },
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -470,10 +469,10 @@ def test_stop_via_update_from_output():
         total_num_scheduled_tokens=3,
         scheduled_encoder_inputs={},
         scheduled_spec_decode_tokens={requests[0].request_id: [EOS_TOKEN_ID, 10]},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -1941,7 +1940,6 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
         sampling_params=sampling_params,
         pooling_params=None,
         eos_token_id=EOS_TOKEN_ID,
-        structured_output_request=StructuredOutputRequest(sampling_params),
     )
     scheduler.add_request(request)
     output = scheduler.schedule()
diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
index 0bb67b574fa14..b5c8f378be182 100644
--- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
+++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py
@@ -26,7 +26,7 @@ def _make_empty_scheduler_output():
         num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
         kv_connector_metadata=SharedStorageConnectorMetadata(),
     )
diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py
index df9fcdc37fa37..e471174ef6744 100644
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -89,10 +89,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
         total_num_scheduled_tokens=total_num_scheduled_tokens,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -168,10 +168,10 @@ def test_update_states_request_finished(model_runner):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids={req_id},
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -198,10 +198,10 @@ def test_update_states_request_resumed(model_runner):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -225,10 +225,10 @@ def test_update_states_request_resumed(model_runner):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -256,10 +256,10 @@ def test_update_states_no_changes(model_runner):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -291,10 +291,10 @@ def test_update_states_request_unscheduled(model_runner):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 817cd7f10c1c6..fe52f565c8a86 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -146,10 +146,10 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
         total_num_scheduled_tokens=total_num_scheduled_tokens,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -212,10 +212,10 @@ def test_update_states_request_finished(model_runner, dist_init):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids={req_id},
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -244,10 +244,10 @@ def test_update_states_request_resumed(model_runner, dist_init):
         total_num_scheduled_tokens=0,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -273,10 +273,10 @@ def test_update_states_request_resumed(model_runner, dist_init):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -366,10 +366,10 @@ def test_update_states_no_changes(model_runner, dist_init):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
@@ -403,10 +403,10 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
         total_num_scheduled_tokens=1,
         scheduled_spec_decode_tokens={},
         scheduled_encoder_inputs={},
-        num_common_prefix_blocks=0,
+        num_common_prefix_blocks=[],
         finished_req_ids=set(),
         free_encoder_mm_hashes=[],
-        structured_output_request_ids={},
+        structured_output_request_ids=[],
         grammar_bitmask=None,
     )
 
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index bce15e1a476fd..619dcd178a13a 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -165,9 +165,8 @@ class SchedulerOutput:
     # freed from the encoder cache.
     free_encoder_mm_hashes: list[str]
 
-    # Dict of request ids to their index within the batch
-    # for filling the next token bitmask
-    structured_output_request_ids: dict[str, int]
+    # IDs of structured output requests included in the bitmask, in order.
+    structured_output_request_ids: list[str]
     # the bitmask for the whole batch
     grammar_bitmask: "npt.NDArray[np.int32] | None"
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 9a1d31268ab7c..08368b7d99efe 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -5,7 +5,7 @@ import itertools
 import time
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
@@ -34,6 +34,10 @@ from vllm.v1.request import Request, RequestStatus
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 from vllm.v1.structured_output import StructuredOutputManager
 
+if TYPE_CHECKING:
+    import numpy as np
+    import numpy.typing as npt
+
 logger = init_logger(__name__)
 
 
@@ -608,11 +612,8 @@ class Scheduler(SchedulerInterface):
             scheduled_spec_decode_tokens,
             req_to_new_blocks,
         )
-        scheduled_requests = (
-            scheduled_new_reqs + scheduled_running_reqs + scheduled_resumed_reqs
-        )
         structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask(
-            scheduled_requests, scheduled_spec_decode_tokens
+            num_scheduled_tokens.keys(), scheduled_spec_decode_tokens
         )
         scheduler_output = SchedulerOutput(
             scheduled_new_reqs=new_reqs_data,
@@ -876,32 +877,28 @@ class Scheduler(SchedulerInterface):
 
     def get_grammar_bitmask(
         self,
-        requests: list[Request],
+        scheduled_request_ids: Iterable[str],
         scheduled_spec_decode_tokens: dict[str, list[int]],
-    ):
-        # NOTE: structured_output_request_ids maps
-        # a request's (request that uses structured output)
-        # request_id to its index in the batch.
-        # This will help us determine to slice the grammar bitmask
-        # and only applies valid mask for requests that
-        # uses structured decoding.
-        structured_output_request_ids: dict[str, int] = {}
-        for i, req in enumerate(requests):
-            if req.use_structured_output:
-                # PERF: in case of chunked prefill,
-                # request might not include any new tokens.
-                # Therefore, we might introduce some additional
-                # cycle to fill in the bitmask, which could be a big no-op.
-                structured_output_request_ids[req.request_id] = i
-
+    ) -> tuple[list[str], "npt.NDArray[np.int32] | None"]:
+        # Collect list of scheduled request ids that use structured output.
+        # The corresponding rows of the bitmask will be in this order.
+        # PERF: in the case of chunked prefill, a scheduled request
+        # might not include any new tokens. We may therefore spend
+        # some additional cycles filling in its bitmask row,
+        # which could be a big no-op.
+        structured_output_request_ids = [
+            req_id
+            for req_id in scheduled_request_ids
+            if (req := self.requests.get(req_id)) and req.use_structured_output
+        ]
         if not structured_output_request_ids:
-            bitmask = None
-        else:
-            bitmask = self.structured_output_manager.grammar_bitmask(
-                self.requests,
-                structured_output_request_ids,
-                scheduled_spec_decode_tokens,
-            )
+            return structured_output_request_ids, None
+
+        bitmask = self.structured_output_manager.grammar_bitmask(
+            self.requests,
+            structured_output_request_ids,
+            scheduled_spec_decode_tokens,
+        )
         return structured_output_request_ids, bitmask
 
     def update_from_output(
@@ -1013,12 +1010,10 @@ class Scheduler(SchedulerInterface):
                 new_logprobs = logprobs.slice(req_index, req_index + 1)
 
             if new_token_ids and self.structured_output_manager.should_advance(request):
-                # NOTE: structured_output_request
-                # should not be None if use_structured_output, we have
-                # checked above, so safe to ignore type warning
-                request.structured_output_request.grammar.accept_tokens(  # type: ignore[union-attr]
-                    req_id, new_token_ids
-                )
+                struct_output_request = request.structured_output_request
+                assert struct_output_request is not None
+                assert struct_output_request.grammar is not None
+                struct_output_request.grammar.accept_tokens(req_id, new_token_ids)
 
             if num_nans_in_logits is not None and req_id in num_nans_in_logits:
                 request.num_nans_in_logits = num_nans_in_logits[req_id]
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 5926bf5b46ee9..864b0eb7fa410 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -40,7 +40,6 @@ class Request:
         prompt_embeds: torch.Tensor | None = None,
         mm_features: list[MultiModalFeatureSpec] | None = None,
         lora_request: Optional["LoRARequest"] = None,
-        structured_output_request: Optional["StructuredOutputRequest"] = None,
         cache_salt: str | None = None,
         priority: int = 0,
         trace_headers: Mapping[str, str] | None = None,
@@ -54,11 +53,12 @@ class Request:
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
-        self.structured_output_request = structured_output_request
+        self.structured_output_request = StructuredOutputRequest.from_sampling_params(
+            sampling_params
+        )
         self.arrival_time = arrival_time if arrival_time is not None else time.time()
 
         self.status = RequestStatus.WAITING
-        self.use_structured_output = False
         self.events: list[EngineCoreEvent] = []
         self.stop_reason: int | str | None = None
 
@@ -72,9 +72,8 @@ class Request:
             # Generative models.
             assert sampling_params.max_tokens is not None
             self.max_tokens = sampling_params.max_tokens
-            if sampling_params.structured_outputs is not None:
+            if self.structured_output_request is not None:
                 self.status = RequestStatus.WAITING_FOR_FSM
-                self.use_structured_output = True
 
             if sampling_params.extra_args is not None:
                 self.kv_transfer_params = sampling_params.extra_args.get(
@@ -145,11 +144,6 @@ class Request:
             eos_token_id=request.eos_token_id,
             arrival_time=request.arrival_time,
             lora_request=request.lora_request,
-            structured_output_request=StructuredOutputRequest(
-                sampling_params=request.sampling_params
-            )
-            if request.sampling_params
-            else None,
             cache_salt=request.cache_salt,
             priority=request.priority,
             trace_headers=request.trace_headers,
@@ -170,6 +164,10 @@ class Request:
         if self.get_hash_new_full_blocks is not None:
             self.block_hashes.extend(self.get_hash_new_full_blocks())
 
+    @property
+    def use_structured_output(self) -> bool:
+        return self.structured_output_request is not None
+
     @property
     def is_output_corrupted(self) -> bool:
         return self.num_nans_in_logits > 0
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 336a0eb98682a..8d7f4b5d68961 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -167,7 +167,7 @@ class StructuredOutputManager:
     def grammar_bitmask(
         self,
         requests: dict[str, Request],
-        structured_output_request_ids: dict[str, int],
+        structured_output_request_ids: list[str],
         scheduled_spec_decode_tokens: dict[str, list[int]],
     ) -> "npt.NDArray[np.int32] | None":
         # Prepare the structured output bitmask for this batch.
@@ -196,17 +196,16 @@ class StructuredOutputManager:
         # masks for each request, one for each possible bonus token position.
         # These are stored inline in the tensor and unpacked by the gpu runner.
         cumulative_index = 0
-        ordered_seq = sorted(structured_output_request_ids.items(), key=lambda x: x[1])
 
         # Optimized parallel filling of bitmasks for
         # non-spec, large-batch-size cases
         if (
-            len(ordered_seq) > self.fill_bitmask_parallel_threshold
+            len(structured_output_request_ids) > self.fill_bitmask_parallel_threshold
             and max_num_spec_tokens == 0
         ):
             promises = []
             batch = []
-            for req_id, _ in ordered_seq:
+            for req_id in structured_output_request_ids:
                 request = requests[req_id]
                 structured_output_request = request.structured_output_request
                 if TYPE_CHECKING:
@@ -230,7 +229,7 @@ class StructuredOutputManager:
                 promise.result()
         else:
             # Fallback to serial filling of bitmasks for small-batch-size cases
-            for req_id, _ in ordered_seq:
+            for req_id in structured_output_request_ids:
                 request = requests[req_id]
                 structured_output_request = request.structured_output_request
 
@@ -295,22 +294,21 @@ class StructuredOutputManager:
             assert request.structured_output_request.grammar is not None
         # by default, we should always advance
         # for cases that don't use thinking mode.
-        if self.reasoner is not None:
-            structured_req = request.structured_output_request
-
-            if structured_req.reasoning_ended:
-                return True
-
-            # Check if reasoning ends in *this* step
-            if self.reasoner.is_reasoning_end(request.all_token_ids):
-                # Reasoning just ended, so we shouldn't advance til
-                # next pass
-                structured_req.reasoning_ended = True
-
-            return False
-        else:
+        if self.reasoner is None:
             return True
 
+        structured_req = request.structured_output_request
+        if structured_req.reasoning_ended:
+            return True
+
+        # Check if reasoning ends in *this* step
+        if self.reasoner.is_reasoning_end(request.all_token_ids):
+            # Reasoning just ended, so we shouldn't advance until
+            # the next pass.
+            structured_req.reasoning_ended = True
+
+        return False
+
     def clear_backend(self) -> None:
         if self.backend is not None:
             self.backend.destroy()
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index c37193e667aab..8e75b99f8481f 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -252,7 +252,7 @@ def serialize_guidance_grammar(
 def validate_guidance_grammar(
     sampling_params: SamplingParams, tokenizer: llguidance.LLTokenizer | None = None
 ) -> None:
-    tp, grm = get_structured_output_key(sampling_params)
+    tp, grm = get_structured_output_key(sampling_params.structured_outputs)
     guidance_grm = serialize_guidance_grammar(tp, grm)
     err = llguidance.LLMatcher.validate_grammar(guidance_grm, tokenizer)
     if err:
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index 9e149b186c639..afe0e4b3f3a7f 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -7,7 +7,7 @@ from concurrent.futures import Future
 from concurrent.futures._base import TimeoutError
 from typing import cast
 
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.structured_output.backend_types import (
     StructuredOutputGrammar,
     StructuredOutputKey,
@@ -17,10 +17,19 @@ from vllm.v1.structured_output.backend_types import (
 
 @dataclasses.dataclass
 class StructuredOutputRequest:
-    sampling_params: SamplingParams
+    params: StructuredOutputsParams
     _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None
     reasoning_ended: bool | None = None
 
+    @staticmethod
+    def from_sampling_params(
+        sampling_params: SamplingParams | None,
+    ) -> "StructuredOutputRequest | None":
+        if sampling_params is None:
+            return None
+        params = sampling_params.structured_outputs
+        return StructuredOutputRequest(params=params) if params else None
+
     def _check_grammar_completion(self) -> bool:
         # NOTE: We have to lazy import to gate circular imports
         from vllm.v1.request import RequestStatus
@@ -53,31 +62,28 @@ class StructuredOutputRequest:
 
     @functools.cached_property
     def structured_output_key(self) -> StructuredOutputKey:
-        return get_structured_output_key(self.sampling_params)
+        return get_structured_output_key(self.params)
 
 
-def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey:
-    params = sampling_params.structured_outputs
-    assert params is not None, "params can't be None."
+def get_structured_output_key(params: StructuredOutputsParams) -> StructuredOutputKey:
     if params.json is not None:
         if not isinstance(params.json, str):
             json_str = json.dumps(params.json)
         else:
             json_str = params.json
-        return (StructuredOutputOptions.JSON, json_str)
-    elif params.json_object:
-        return (StructuredOutputOptions.JSON_OBJECT, "")
-    elif params.regex is not None:
-        return (StructuredOutputOptions.REGEX, params.regex)
-    elif params.choice is not None:
+        return StructuredOutputOptions.JSON, json_str
+    if params.json_object:
+        return StructuredOutputOptions.JSON_OBJECT, ""
+    if params.regex is not None:
+        return StructuredOutputOptions.REGEX, params.regex
+    if params.choice is not None:
         if not isinstance(params.choice, str):
             json_str = json.dumps(params.choice)
         else:
             json_str = params.choice
-        return (StructuredOutputOptions.CHOICE, json_str)
-    elif params.grammar is not None:
-        return (StructuredOutputOptions.GRAMMAR, params.grammar)
-    elif params.structural_tag is not None:
-        return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag)
-    else:
-        raise ValueError("No valid structured output parameter found")
+        return StructuredOutputOptions.CHOICE, json_str
+    if params.grammar is not None:
+        return StructuredOutputOptions.GRAMMAR, params.grammar
+    if params.structural_tag is not None:
+        return StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag
+    raise ValueError("No valid structured output parameter found")
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 2520dc217c798..4b793b9a72fd7 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -47,7 +47,6 @@ def apply_grammar_bitmask(
     scheduler_output: SchedulerOutput,
     input_batch: InputBatch,
     logits: torch.Tensor,
-    device: torch.device,
 ) -> None:
     """
     Apply grammar bitmask to output logits of the model with xgrammar function.
@@ -56,7 +55,6 @@ def apply_grammar_bitmask(
         scheduler_output (SchedulerOutput): The result of engine scheduling.
         input_batch (InputBatch): The input of model runner.
         logits (torch.Tensor): The output logits of model forward.
-        device (torch.device): The device that model runner running on.
     """
     grammar_bitmask = scheduler_output.grammar_bitmask
     if grammar_bitmask is None:
@@ -91,10 +89,7 @@ def apply_grammar_bitmask(
         dtype=grammar_bitmask.dtype,
     )
     cumulative_index = 0
-    seq = sorted(
-        scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1]
-    )
-    for req_id, _ in seq:
+    for req_id in scheduler_output.structured_output_request_ids:
         num_spec_tokens = len(
             scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
         )
@@ -117,7 +112,7 @@ def apply_grammar_bitmask(
 
     xgr.apply_token_bitmask_inplace(
         logits,
-        grammar_bitmask.to(device, non_blocking=True),
+        grammar_bitmask.to(logits.device, non_blocking=True),
         indices=out_indices if not skip_out_indices else None,
     )
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index bbb63d28289c4..72f8824e20054 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2568,10 +2568,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 logits = model_output_broadcast_data["logits"]
 
             # Apply structured output bitmasks if present
-            if scheduler_output.grammar_bitmask is not None:
-                apply_grammar_bitmask(
-                    scheduler_output, self.input_batch, logits, self.device
-                )
+            if scheduler_output.structured_output_request_ids:
+                apply_grammar_bitmask(scheduler_output, self.input_batch, logits)
 
         with record_function_or_nullcontext("Sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 828f09cbc8d8d..2107df5fc1032 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1963,12 +1963,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.grammar_bitmask_cpu.zero_()
         self.require_structured_out_cpu.zero_()
 
-        sorted_struct_requests = sorted(
-            scheduler_output.structured_output_request_ids.items(),
-            key=lambda item: item[1],
-        )
         cumulative_mask_idx = 0
-        for req_id, _ in sorted_struct_requests:
+        for req_id in scheduler_output.structured_output_request_ids:
             if req_id not in self.input_batch.req_id_to_index:
                 continue
             batch_index = self.input_batch.req_id_to_index[req_id]

From 7e0ef4084affa9de84904ba7726c46f53f4f6379 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 14 Oct 2025 19:41:43 -0400
Subject: [PATCH 03/51] [CI Failure] Fix torchao dep failure for Quantization
 Test (#26824)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 .buildkite/test-amd.yaml                       | 3 ++-
 .buildkite/test-pipeline.yaml                  | 3 ++-
 tests/quantization/test_compressed_tensors.py  | 3 ++-
 vllm/model_executor/layers/quantization/rtn.py | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index b2a3a0a775baa..91f0b850575c4 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -603,7 +603,8 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): upgrade torchao once the issue above is resolved
+  - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 
 - label: LM Eval Small Models # 53min
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ebe0602a1b5db..94c0944c838ce 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -527,7 +527,8 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
+  # TODO(jerryzh168): upgrade torchao once the issue above is resolved
+  - uv pip install --system torchao==0.13.0
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 
 - label: LM Eval Small Models # 53min
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index ef7164c8813da..5aeb002238cf9 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -697,7 +697,8 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
 @pytest.mark.parametrize(
     "args",
     [
-        ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
+        # TODO: Enable once model is available again
+        # ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
         ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
     ],
 )
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index c041d2fd0ba48..e4f7ff8339569 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
 )
+from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase
 from vllm.model_executor.layers.linear import (
     LinearBase,
@@ -396,7 +397,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
             indices_type=self.topk_indices_dtype,
         )
 
-        return torch.ops.vllm.fused_marlin_moe(
+        return fused_marlin_moe(
             x,
             layer.w13_weight,
             layer.w2_weight,

From 0512c04aee408367a068b5960e7857c722ed204d Mon Sep 17 00:00:00 2001
From: Ye Hu <hyelacora@gmail.com>
Date: Tue, 14 Oct 2025 16:48:13 -0700
Subject: [PATCH 04/51] [frontend][gptoss] Add per turn stats into Harmony
 Context (#25061)

Signed-off-by: lacora <hyelacora@gmail.com>
Co-authored-by: Ye Hu <yehu@fb.com>
---
 tests/entrypoints/test_context.py            | 93 ++++++++++++++++++--
 vllm/entrypoints/context.py                  | 65 +++++++++-----
 vllm/entrypoints/openai/protocol.py          |  4 +
 vllm/entrypoints/openai/serving_responses.py | 88 +++++++++++-------
 4 files changed, 188 insertions(+), 62 deletions(-)

diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py
index b0faa870a9272..31ea856224f90 100644
--- a/tests/entrypoints/test_context.py
+++ b/tests/entrypoints/test_context.py
@@ -6,7 +6,11 @@ from unittest.mock import MagicMock, patch
 import pytest
 from openai_harmony import Author, Message, Role, StreamState, TextContent
 
-from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
+from vllm.entrypoints.context import (
+    HarmonyContext,
+    StreamingHarmonyContext,
+    TurnMetrics,
+)
 from vllm.outputs import CompletionOutput, RequestOutput
 
 
@@ -101,8 +105,12 @@ def test_single_turn_token_counting():
 
     # Verify internal state tracking
     assert not context.is_first_turn
-    assert context.previous_turn.input_tokens == 5
-    assert context.previous_turn.output_tokens == 3
+    assert len(context.all_turn_metrics) == 1
+    previous_turn = context.all_turn_metrics[0]
+    assert previous_turn.input_tokens == 5
+    assert previous_turn.output_tokens == 3
+    assert previous_turn.cached_input_tokens == 2
+    assert previous_turn.tool_output_tokens == 0
 
 
 @pytest.mark.asyncio
@@ -156,6 +164,15 @@ async def test_multi_turn_token_counting():
     assert context.num_tool_output_tokens == expected_tool_output
     assert context.num_cached_tokens == 5 + 15
 
+    # Validate all turn metrics
+    assert len(context.all_turn_metrics) == 3
+    for i, turn in enumerate(context.all_turn_metrics):
+        assert turn.input_tokens == prompt_token_counts[i]
+        assert turn.output_tokens == output_token_counts[i]
+        assert turn.cached_input_tokens == cached_token_counts[i]
+    assert context.all_turn_metrics[1].tool_output_tokens == 7
+    assert context.all_turn_metrics[2].tool_output_tokens == 1
+
 
 def test_empty_output_tokens():
     """Test behavior when RequestOutput has empty output tokens."""
@@ -314,6 +331,10 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     # Create a streaming context
     context = StreamingHarmonyContext(messages=[], available_tools=["browser"])
 
+    num_prompt_tokens = [3, 8, 13]
+    num_output_tokens = [3, 3, 2]
+    num_cached_tokens = [0, 3, 8]
+
     # Simulate three turns of conversation:
     # Turn 1: stream tokens one by one, then finish the message
     # Turn 2: new prompt, stream more tokens with a reasoning segment
@@ -325,7 +346,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
         create_mock_request_output(
             prompt_token_ids=[1, 2, 3],  # 3 prompt tokens
             output_token_ids=[101],  # Single token
-            num_cached_tokens=0,
+            num_cached_tokens=num_cached_tokens[0],
             finished=False,  # Not end of message yet
         )
     )
@@ -370,7 +391,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
                 5,
             ],  # 8 tokens (includes previous)
             output_token_ids=[201],
-            num_cached_tokens=3,  # Some tokens cached
+            num_cached_tokens=num_cached_tokens[1],  # Some tokens cached
             finished=False,
         )
     )
@@ -422,7 +443,7 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
                 7,
             ],  # 13 tokens
             output_token_ids=[301],
-            num_cached_tokens=8,  # More cached tokens
+            num_cached_tokens=num_cached_tokens[2],  # More cached tokens
             finished=False,
         )
     )
@@ -435,10 +456,12 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
     )
 
     # Final token counts check
-    assert context.num_prompt_tokens == 3 + 8 + 13  # All prompts
-    assert context.num_output_tokens == 3 + 3 + 2  # All outputs
+    assert context.num_prompt_tokens == sum(num_prompt_tokens)  # All prompts
+    assert context.num_output_tokens == sum(num_output_tokens)  # All outputs
     assert context.num_reasoning_tokens == 3  # Unchanged from second turn
-    assert context.num_cached_tokens == 3 + 8  # Accumulated cached tokens
+    assert context.num_cached_tokens == sum(
+        num_cached_tokens
+    )  # Accumulated cached tokens
 
     # Additional tool tokens from third turn
     # Formula: this turn prompt - last turn prompt - last turn output
@@ -447,6 +470,15 @@ async def test_streaming_multi_turn_token_counting(mock_parser):
         context.num_tool_output_tokens == expected_tool_tokens + additional_tool_tokens
     )
 
+    # Validate all turn metrics
+    assert len(context.all_turn_metrics) == 3
+    for i, turn in enumerate(context.all_turn_metrics):
+        assert turn.input_tokens == num_prompt_tokens[i]
+        assert turn.output_tokens == num_output_tokens[i]
+        assert turn.cached_input_tokens == num_cached_tokens[i]
+    assert context.all_turn_metrics[1].tool_output_tokens == 2
+    assert context.all_turn_metrics[2].tool_output_tokens == 2
+
 
 @pytest.mark.asyncio
 async def test_streaming_message_synchronization(mock_parser):
@@ -522,3 +554,46 @@ async def test_streaming_message_synchronization(mock_parser):
     assert len(context._messages) == 3
     assert context.num_init_messages == 1
     assert context._messages[2].content[0].text == "Response 4"
+
+
+def test_turn_metrics_copy_and_reset():
+    """Test TurnMetrics copy and reset methods work correctly."""
+    # Create a TurnMetrics with specific values
+    original_metrics = TurnMetrics(
+        input_tokens=10,
+        output_tokens=20,
+        cached_input_tokens=5,
+        tool_output_tokens=3,
+    )
+
+    # Test copy functionality
+    copied_metrics = original_metrics.copy()
+
+    # Verify copy has same values
+    assert copied_metrics.input_tokens == 10
+    assert copied_metrics.output_tokens == 20
+    assert copied_metrics.cached_input_tokens == 5
+    assert copied_metrics.tool_output_tokens == 3
+
+    # Verify they are separate objects
+    assert copied_metrics is not original_metrics
+
+    # Modify copy to ensure independence
+    copied_metrics.input_tokens = 999
+    assert original_metrics.input_tokens == 10  # Original unchanged
+    assert copied_metrics.input_tokens == 999
+
+    # Test reset functionality
+    original_metrics.reset()
+
+    # Verify all fields are reset to zero
+    assert original_metrics.input_tokens == 0
+    assert original_metrics.output_tokens == 0
+    assert original_metrics.cached_input_tokens == 0
+    assert original_metrics.tool_output_tokens == 0
+
+    # Verify copied metrics are unaffected by reset
+    assert copied_metrics.input_tokens == 999
+    assert copied_metrics.output_tokens == 20
+    assert copied_metrics.cached_input_tokens == 5
+    assert copied_metrics.tool_output_tokens == 3
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index c694bcfaaa756..8f94880e431be 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -45,21 +45,36 @@ def _map_tool_name_to_tool_type(tool_name: str) -> str:
     return _TOOL_NAME_TO_TYPE_MAP[tool_name]
 
 
-class TurnTokens:
-    """Tracks token counts for a single conversation turn."""
+class TurnMetrics:
+    """Tracks token and toolcall details for a single conversation turn."""
 
-    def __init__(self, input_tokens=0, output_tokens=0):
+    def __init__(
+        self,
+        input_tokens=0,
+        output_tokens=0,
+        cached_input_tokens=0,
+        tool_output_tokens=0,
+    ):
         self.input_tokens = input_tokens
         self.output_tokens = output_tokens
+        self.cached_input_tokens = cached_input_tokens
+        self.tool_output_tokens = tool_output_tokens
 
     def reset(self):
         """Reset counters for a new turn."""
         self.input_tokens = 0
         self.output_tokens = 0
+        self.cached_input_tokens = 0
+        self.tool_output_tokens = 0
 
     def copy(self):
         """Create a copy of this turn's token counts."""
-        return TurnTokens(self.input_tokens, self.output_tokens)
+        return TurnMetrics(
+            self.input_tokens,
+            self.output_tokens,
+            self.cached_input_tokens,
+            self.tool_output_tokens,
+        )
 
 
 class ConversationContext(ABC):
@@ -102,6 +117,8 @@ class SimpleContext(ConversationContext):
         self.num_cached_tokens = 0
         # todo num_reasoning_tokens is not implemented yet.
         self.num_reasoning_tokens = 0
+        # Per-turn metrics are not implemented yet for SimpleContext
+        self.all_turn_metrics = []
 
     def append_output(self, output) -> None:
         self.last_output = output
@@ -154,8 +171,9 @@ class HarmonyContext(ConversationContext):
         self.num_tool_output_tokens = 0
 
         # Turn tracking - replaces multiple individual tracking variables
-        self.current_turn = TurnTokens()
-        self.previous_turn = TurnTokens()
+        self.current_turn_metrics = TurnMetrics()
+        # Track metrics for all turns
+        self.all_turn_metrics: list[TurnMetrics] = []
         self.is_first_turn = True
         self.first_tok_of_message = True  # For streaming support
 
@@ -173,11 +191,10 @@ class HarmonyContext(ConversationContext):
                 # Check if the current token is part of reasoning content
                 self._update_num_reasoning_tokens()
             self._update_prefill_token_usage(output)
-            # Reset current turn output tokens for this turn
-            self.current_turn.output_tokens = 0
             self._update_decode_token_usage(output)
-            # Move current turn to previous turn for next turn's calculations
-            self.previous_turn = self.current_turn.copy()
+            # Record this turn's metrics; the next turn's calculations read them
+            self.all_turn_metrics.append(self.current_turn_metrics.copy())
+            self.current_turn_metrics.reset()
             # append_output is called only once before tool calling
             # in non-streaming case
             # so we can append all the parser messages to _messages
@@ -213,20 +230,21 @@ class HarmonyContext(ConversationContext):
             logger.error("RequestOutput appended contains no prompt_token_ids.")
 
         # Update current turn input tokens
-        self.current_turn.input_tokens = this_turn_input_tokens
+        self.current_turn_metrics.input_tokens = this_turn_input_tokens
         self.num_prompt_tokens += this_turn_input_tokens
 
         # Calculate tool tokens (except on first turn)
         if self.is_first_turn:
             self.is_first_turn = False
         else:
+            previous_turn = self.all_turn_metrics[-1]
             # start counting tool after first turn
             # tool tokens = this turn prefill - last turn prefill -
             # last turn decode
             this_turn_tool_tokens = (
-                self.current_turn.input_tokens
-                - self.previous_turn.input_tokens
-                - self.previous_turn.output_tokens
+                self.current_turn_metrics.input_tokens
+                - previous_turn.input_tokens
+                - previous_turn.output_tokens
             )
 
             # Handle negative tool token counts (shouldn't happen in normal
@@ -237,17 +255,20 @@ class HarmonyContext(ConversationContext):
                     "(current_input=%d, previous_input=%d, "
                     "previous_output=%d). Setting to 0.",
                     this_turn_tool_tokens,
-                    self.current_turn.input_tokens,
-                    self.previous_turn.input_tokens,
-                    self.previous_turn.output_tokens,
+                    self.current_turn_metrics.input_tokens,
+                    previous_turn.input_tokens,
+                    previous_turn.output_tokens,
                 )
                 this_turn_tool_tokens = 0
 
             self.num_tool_output_tokens += this_turn_tool_tokens
+            self.current_turn_metrics.tool_output_tokens = this_turn_tool_tokens
 
         # Update cached tokens
-        if output.num_cached_tokens is not None:
-            self.num_cached_tokens += output.num_cached_tokens
+        num_cached_token = output.num_cached_tokens
+        if num_cached_token is not None:
+            self.num_cached_tokens += num_cached_token
+            self.current_turn_metrics.cached_input_tokens = num_cached_token
 
     def _update_decode_token_usage(self, output: RequestOutput) -> int:
         """Update token usage statistics for the decode phase of generation.
@@ -272,7 +293,7 @@ class HarmonyContext(ConversationContext):
                 # only keep last round
                 updated_output_token_count += len(completion_output.token_ids)
             self.num_output_tokens += updated_output_token_count
-            self.current_turn.output_tokens += updated_output_token_count
+            self.current_turn_metrics.output_tokens += updated_output_token_count
         return updated_output_token_count
 
     @property
@@ -452,7 +473,6 @@ class StreamingHarmonyContext(HarmonyContext):
             # so we only want to add the prompt tokens once for each message.
             if self.first_tok_of_message:
                 self._update_prefill_token_usage(output)
-                self.current_turn.output_tokens = 0
             # Reset self.first_tok_of_message if needed:
             # if the current token is the last one of the current message
             # (finished=True), then the next token processed will mark the
@@ -464,7 +484,8 @@ class StreamingHarmonyContext(HarmonyContext):
 
             # For streaming, update previous turn when message is complete
             if output.finished:
-                self.previous_turn = self.current_turn.copy()
+                self.all_turn_metrics.append(self.current_turn_metrics.copy())
+                self.current_turn_metrics.reset()
             # Check if the current token is part of reasoning content
             self._update_num_reasoning_tokens()
             self.last_tok = tok
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index f41fa196acd81..86e1e62ff437b 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -2103,11 +2103,15 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
 
 class InputTokensDetails(OpenAIBaseModel):
     cached_tokens: int
+    input_tokens_per_turn: list[int] = Field(default_factory=list)
+    cached_tokens_per_turn: list[int] = Field(default_factory=list)
 
 
 class OutputTokensDetails(OpenAIBaseModel):
     reasoning_tokens: int = 0
     tool_output_tokens: int = 0
+    output_tokens_per_turn: list[int] = Field(default_factory=list)
+    tool_output_tokens_per_turn: list[int] = Field(default_factory=list)
 
 
 class ResponseUsage(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 51e2856a5a9dd..6cdabff6e709b 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -589,10 +589,24 @@ class OpenAIServingResponses(OpenAIServing):
             input_tokens=num_prompt_tokens,
             output_tokens=num_generated_tokens,
             total_tokens=num_prompt_tokens + num_generated_tokens,
-            input_tokens_details=InputTokensDetails(cached_tokens=num_cached_tokens),
+            input_tokens_details=InputTokensDetails(
+                cached_tokens=num_cached_tokens,
+                input_tokens_per_turn=[
+                    turn.input_tokens for turn in context.all_turn_metrics
+                ],
+                cached_tokens_per_turn=[
+                    turn.cached_input_tokens for turn in context.all_turn_metrics
+                ],
+            ),
             output_tokens_details=OutputTokensDetails(
                 reasoning_tokens=num_reasoning_tokens,
                 tool_output_tokens=num_tool_output_tokens,
+                output_tokens_per_turn=[
+                    turn.output_tokens for turn in context.all_turn_metrics
+                ],
+                tool_output_tokens_per_turn=[
+                    turn.tool_output_tokens for turn in context.all_turn_metrics
+                ],
             ),
         )
         response = ResponsesResponse.from_request(
@@ -665,11 +679,13 @@ class OpenAIServingResponses(OpenAIServing):
                     token=text,
                     logprob=max(token_logprob.logprob, -9999.0),
                     bytes=list(text.encode("utf-8", errors="replace")),
-                    top_logprobs=self._topk_logprobs(
-                        logprob, top_logprobs=top_logprobs, tokenizer=tokenizer
-                    )
-                    if top_logprobs
-                    else [],
+                    top_logprobs=(
+                        self._topk_logprobs(
+                            logprob, top_logprobs=top_logprobs, tokenizer=tokenizer
+                        )
+                        if top_logprobs
+                        else []
+                    ),
                 )
             )
         return out
@@ -758,14 +774,16 @@ class OpenAIServingResponses(OpenAIServing):
                 text=content,
                 annotations=[],  # TODO
                 type="output_text",
-                logprobs=self._create_response_logprobs(
-                    token_ids=final_output.token_ids,
-                    logprobs=final_output.logprobs,
-                    tokenizer=tokenizer,
-                    top_logprobs=request.top_logprobs,
-                )
-                if request.is_include_output_logprobs()
-                else None,
+                logprobs=(
+                    self._create_response_logprobs(
+                        token_ids=final_output.token_ids,
+                        logprobs=final_output.logprobs,
+                        tokenizer=tokenizer,
+                        top_logprobs=request.top_logprobs,
+                    )
+                    if request.is_include_output_logprobs()
+                    else None
+                ),
             )
             message = ResponseOutputMessage(
                 id=f"msg_{random_uuid()}",
@@ -870,15 +888,21 @@ class OpenAIServingResponses(OpenAIServing):
             with_custom_tools = has_custom_tools(tool_types)
             sys_msg = get_system_message(
                 reasoning_effort=reasoning_effort,
-                browser_description=self.tool_server.get_tool_description("browser")
-                if enable_browser and self.tool_server is not None
-                else None,
-                python_description=self.tool_server.get_tool_description("python")
-                if enable_code_interpreter and self.tool_server is not None
-                else None,
-                container_description=self.tool_server.get_tool_description("container")
-                if enable_container and self.tool_server is not None
-                else None,
+                browser_description=(
+                    self.tool_server.get_tool_description("browser")
+                    if enable_browser and self.tool_server is not None
+                    else None
+                ),
+                python_description=(
+                    self.tool_server.get_tool_description("python")
+                    if enable_code_interpreter and self.tool_server is not None
+                    else None
+                ),
+                container_description=(
+                    self.tool_server.get_tool_description("container")
+                    if enable_container and self.tool_server is not None
+                    else None
+                ),
                 instructions=request.instructions,
                 with_custom_tools=with_custom_tools,
             )
@@ -1283,14 +1307,16 @@ class OpenAIServingResponses(OpenAIServing):
                             output_index=current_output_index,
                             item_id=current_item_id,
                             delta=delta_message.content,
-                            logprobs=self._create_stream_response_logprobs(
-                                token_ids=output.token_ids,
-                                logprobs=output.logprobs,
-                                tokenizer=tokenizer,
-                                top_logprobs=request.top_logprobs,
-                            )
-                            if request.is_include_output_logprobs()
-                            else [],
+                            logprobs=(
+                                self._create_stream_response_logprobs(
+                                    token_ids=output.token_ids,
+                                    logprobs=output.logprobs,
+                                    tokenizer=tokenizer,
+                                    top_logprobs=request.top_logprobs,
+                                )
+                                if request.is_include_output_logprobs()
+                                else []
+                            ),
                         )
                     )
                 current_content_index += 1

From 579d2e5458b19c442f48e0cba0ba71c5d4abf6ea Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Date: Tue, 14 Oct 2025 19:51:54 -0400
Subject: [PATCH 05/51] [WideEP][P/D] Add usage stats for DP+EP and KV
 Connector (#26836)

Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
---
 vllm/v1/utils.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index f03efe21098bf..6aebe295b5ce5 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -345,13 +345,17 @@ def report_usage_stats(
 
     parallel_config = vllm_config.parallel_config
 
+    # Prepare KV connector string if applicable
+    kv_connector = None
+    if vllm_config.kv_transfer_config is not None:
+        kv_connector = vllm_config.kv_transfer_config.kv_connector
+
     usage_message.report_usage(
         get_architecture_class_name(vllm_config.model_config),
         usage_context,
         extra_kvs={
             # Common configuration
             "dtype": str(vllm_config.model_config.dtype),
-            "tensor_parallel_size": parallel_config.tensor_parallel_size,
             "block_size": vllm_config.cache_config.block_size,
             "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
             "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
@@ -363,6 +367,15 @@ def report_usage_stats(
             "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
             "enforce_eager": vllm_config.model_config.enforce_eager,
             "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
+            # Distributed parallelism settings
+            "tensor_parallel_size": parallel_config.tensor_parallel_size,
+            "data_parallel_size": parallel_config.data_parallel_size,
+            "pipeline_parallel_size": parallel_config.pipeline_parallel_size,
+            "enable_expert_parallel": parallel_config.enable_expert_parallel,
+            # All2All backend for MoE expert parallel
+            "all2all_backend": parallel_config.all2all_backend,
+            # KV connector used
+            "kv_connector": kv_connector,
         },
     )
 

From 2dcd12d3571b070432ad1cd321a67b840b4a34b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luka=20Govedi=C4=8D?=
 <ProExpertProg@users.noreply.github.com>
Date: Tue, 14 Oct 2025 19:55:02 -0400
Subject: [PATCH 06/51] [torch.compile] Fix tests for torch==2.9 inductor
 partition (#26116)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
---
 .../compile/piecewise/test_full_cudagraph.py  |  29 +++--
 .../compile/piecewise/test_multiple_graphs.py |  38 ++++--
 tests/compile/piecewise/test_toy_llama.py     | 117 +++++++++++-------
 tests/compile/silly_attention.py              |   1 -
 tests/compile/test_decorator.py               |   3 +
 vllm/attention/layer.py                       |   6 -
 vllm/compilation/partition_rules.py           |  13 +-
 vllm/config/compilation.py                    |   3 +-
 8 files changed, 138 insertions(+), 72 deletions(-)

diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py
index 84194f3ed01e8..e01b58220959f 100644
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -11,6 +11,7 @@ from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
+from vllm.utils import is_torch_equal_or_newer
 
 
 @contextlib.contextmanager
@@ -32,13 +33,13 @@ def temporary_environ(env_vars):
                 os.environ[k] = v
 
 
-test_params_full_cudagraph = []
+model_backends_full_cudagraph = []
 
 # deepseek-ai/DeepSeek-V2-Lite with MLA
 MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
 for mla_backend in MLA_backends:
-    test_params_full_cudagraph.append(
-        pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]))
+    model_backends_full_cudagraph.append(
+        ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
     )
 
 # Qwen/Qwen2-1.5B-Instruct with other backends
@@ -46,14 +47,18 @@ other_backend_configs = [
     backend_configs[c] for c in backend_configs if c not in MLA_backends
 ]
 for backend_config in other_backend_configs:
-    test_params_full_cudagraph.append(
-        pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))
-    )
+    model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))
 
 
 @pytest.fixture(scope="class")
 def llm_pair(request):
-    model, backend_config = request.param
+    model, backend_config, use_inductor_graph_partition = request.param
+    backend_config.comp_config["use_inductor_graph_partition"] = (
+        use_inductor_graph_partition
+    )
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition only supported in torch>=2.9")
 
     # Dynamically skip test if GPU capability is not met
     if (
@@ -104,7 +109,15 @@ def llm_pair(request):
     )
 
 
-@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
+@pytest.mark.parametrize(
+    "llm_pair",
+    [
+        pytest.param((model, backend_config, use_inductor_graph_partition))
+        for model, backend_config in model_backends_full_cudagraph
+        for use_inductor_graph_partition in [True, False]
+    ],
+    indirect=True,
+)
 class TestFullCUDAGraph:
     """
     Use a class such that an llm pair is constructed once for all
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
index d88645e3bfd62..0d265bc596386 100644
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules
 are compiled and graph captured separately.
 """
 
+import pytest
 import torch
 from torch import nn
 
@@ -190,7 +191,12 @@ def run_model(
         return output.cpu()
 
 
-def test_multi_graph_piecewise_compile_outputs_equal():
+@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
+def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
+    if use_inductor_graph_partition:
+        # FIXME(luka/boyuan): this currently fails
+        pytest.skip("Inductor graph partition not supported with multi-graph")
+
     outputs = []
 
     # piecewise compile
@@ -200,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -220,16 +227,24 @@ def test_multi_graph_piecewise_compile_outputs_equal():
     # static tensor addresses
     inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
 
-    with compilation_counter.expect(
-        num_graphs_seen=2,  # two graphs for the model
-        num_piecewise_graphs_seen=6,
+    if use_inductor_graph_partition:
+        # Splitting happens at the Inductor lowering level, so the number
+        # of piecewise fx graphs equals the total number of graphs
+        num_piecewise_fx = 2
+        num_piecewise_capturable_fx = 2
+    else:
         # attn_one, attn_two each has 3 piecewise graphs
         # (pre attn, post attn, silly_attention) each
-        num_piecewise_capturable_graphs_seen=4,
+        num_piecewise_fx = 6
         # attn_one, attn_two has pre attn and post attn each, total=4
-        num_backend_compilations=4,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=8,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_piecewise_capturable_fx = 4
+
+    with compilation_counter.expect(
+        num_graphs_seen=2,  # two graphs for the model
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        num_cudagraph_captured=8,  # num_cudagraph_sizes * num_partitions
     ):
         outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
 
@@ -268,6 +283,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=False,
             splitting_ops=["silly::attention"],
+            use_inductor_graph_partition=use_inductor_graph_partition,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():
 
     with compilation_counter.expect(
         num_graphs_seen=2,
-        num_piecewise_graphs_seen=6,
-        num_piecewise_capturable_graphs_seen=4,
-        num_backend_compilations=4,
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
         num_cudagraph_captured=0,  # no cudagraph captured
     ):
         outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index eaf0a15479e97..7ab610fa78115 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -9,6 +9,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
 initialized randomly with a fixed seed.
 """
 
+from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any
 
@@ -26,6 +27,7 @@ from vllm.config import (
     set_current_vllm_config,
 )
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils import is_torch_equal_or_newer
 
 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
@@ -257,27 +259,13 @@ def tractable_computation(
 
 
 @torch.inference_mode
-def run_model(
-    llama_config, use_compile: bool, backend: str, split_attn: bool = False
-) -> torch.Tensor:
-    if use_compile:
-        compilation_config = CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
-            use_cudagraph=True,
-            backend=backend,
-            cudagraph_capture_sizes=[1, 2],
-        )
-        if split_attn:
-            compilation_config.splitting_ops = ["silly::attention"]
-        cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
-    else:
-        compilation_config = CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION,
-        )
-        cudagraph_runtime_mode = CUDAGraphMode.NONE
+def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
+    # Start with a fresh copy to make sure there's no cache dir sharing
+    compile_config = deepcopy(compile_config)
+    cudagraph_runtime_mode = compile_config.cudagraph_mode
 
     vllm_config = VllmConfig(
-        compilation_config=compilation_config, additional_config=llama_config
+        compilation_config=compile_config, additional_config=llama_config
     )
     with set_current_vllm_config(vllm_config):
         model = (
@@ -338,8 +326,25 @@ def run_model(
             return output.cpu()
 
 
-@pytest.mark.parametrize("backend", ["inductor", "eager"])
-def test_toy_llama(backend: str):
+@pytest.mark.parametrize(
+    "backend, use_inductor_graph_partition",
+    [
+        ("eager", False),  # No inductor
+        ("inductor", False),  # Inductor, Dynamo partition
+        ("inductor", True),  # Inductor, Inductor partition
+    ],
+)
+def test_toy_llama(
+    backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
+):
+    # We disable the vLLM compile cache for 2 reasons:
+    # 1. To make sure we can properly track the number of Inductor compilations.
+    # 2. Inductor partitioning does not play nicely with Autograd cache (below)
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition only supported in torch>=2.9")
+
     # compare output with and without piecewise compilation
 
     llama_config = LlamaConfig(
@@ -350,6 +355,32 @@ def test_toy_llama(backend: str):
         hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
     )
 
+    compile_config_no_compile = CompilationConfig(
+        level=CompilationLevel.NO_COMPILATION,
+        cudagraph_mode=CUDAGraphMode.NONE,
+        backend="eager",
+    )
+
+    compile_config_no_split = CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_inductor_graph_partition=use_inductor_graph_partition,
+        cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        backend=backend,
+        cudagraph_capture_sizes=[1, 2],
+    )
+
+    # FIXME(luka/boyuan): the graph from the previous test case
+    #  (no inductor partition) gets cached by AotAutograd so then the
+    #  compilation with inductor partitioning incorrectly loads an unpartitioned
+    #  graph and never partitions. I think this is a bug with custom inductor
+    #  partitioning but does not affect vLLM more generally as vLLM uses its own
+    #  cache (which takes inductor partitioning into account).
+    if use_inductor_graph_partition:
+        compile_config_no_split.inductor_compile_config["force_disable_caches"] = True
+
+    compile_config_split = deepcopy(compile_config_no_split)
+    compile_config_split.splitting_ops = ["silly::attention"]
+
     outputs = []
     with compilation_counter.expect(
         num_graphs_seen=0,
@@ -358,8 +389,9 @@ def test_toy_llama(backend: str):
         num_backend_compilations=0,
         num_cudagraph_captured=0,
     ):
-        outputs.append(run_model(llama_config, backend="eager", use_compile=False))
-    run_model(tractable_config, backend="eager", use_compile=False)
+        outputs.append(run_model(llama_config, compile_config_no_compile))
+
+    run_model(tractable_config, compile_config_no_compile)
 
     if backend == "inductor":
         kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
@@ -367,35 +399,34 @@ def test_toy_llama(backend: str):
         kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
 
     with compilation_counter.expect(
-        # One graph for the model
-        num_graphs_seen=1,
+        num_graphs_seen=1,  # one graph for the model
         num_piecewise_graphs_seen=1,
         num_piecewise_capturable_graphs_seen=1,
-        # num_piecewise_capturable_graphs_seen
-        num_backend_compilations=1,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
         num_cudagraph_captured=2,
         **kwargs,
     ):
-        outputs.append(run_model(llama_config, backend=backend, use_compile=True))
-    run_model(tractable_config, backend=backend, use_compile=True)
+        outputs.append(run_model(llama_config, compile_config_no_split))
+
+    run_model(tractable_config, compile_config_no_split)
+
+    if use_inductor_graph_partition:
+        num_piecewise_fx = 1
+        num_piecewise_capturable_fx = 1
+    else:
+        num_piecewise_fx = 2 * llama_config.num_layers + 1
+        num_piecewise_capturable_fx = 1 + llama_config.num_layers
 
     with compilation_counter.expect(
         num_graphs_seen=1,  # one graph for the model
-        num_piecewise_graphs_seen=2 * llama_config.num_layers + 1,  # 2 * num_layers + 1
-        num_piecewise_capturable_graphs_seen=1
-        + llama_config.num_layers,  # 1 + num_layers
-        num_backend_compilations=1
-        + llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=2
-        * (
-            1 + llama_config.num_layers
-        ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        # num_cudagraph_sizes * num_partitions
+        num_cudagraph_captured=2 * (1 + llama_config.num_layers),
     ):
-        outputs.append(
-            run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
-        )
-    run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
+        outputs.append(run_model(llama_config, compile_config_split))
+    run_model(tractable_config, compile_config_split)
 
     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py
index c0d3f908149f6..f33c5772906a6 100644
--- a/tests/compile/silly_attention.py
+++ b/tests/compile/silly_attention.py
@@ -62,5 +62,4 @@ direct_register_custom_op(
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
     target_lib=silly_lib,
-    tags=(torch._C.Tag.cudagraph_unsafe,),
 )
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
index 6b050207ec41b..63cb266094a12 100644
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -73,6 +73,7 @@ def test_ignore_torch_compile_decorator():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=False,  # TODO test both?
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -188,6 +189,7 @@ def test_conditional_compile_enable_if():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=False,  # TODO test both
         ),
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -220,6 +222,7 @@ def test_conditional_compile_enable_if():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=False,  # TODO test both?
         ),
     )
 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index fe9de65b52c66..8b5b87cba4044 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -38,10 +38,6 @@ from vllm.utils import GiB_bytes, direct_register_custom_op
 
 logger = init_logger(__name__)
 USE_XFORMERS_OPS = None
-try:
-    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,)
-except AttributeError:
-    tag_cudagraph_unsafe = ()  # type: ignore[assignment]
 
 
 def check_xformers_availability():
@@ -879,7 +875,6 @@ direct_register_custom_op(
     op_name="unified_attention",
     op_func=unified_attention,
     fake_impl=unified_attention_fake,
-    tags=tag_cudagraph_unsafe,
 )
 
 
@@ -931,7 +926,6 @@ direct_register_custom_op(
     op_func=unified_attention_with_output,
     mutates_args=["output", "output_block_scale"],
     fake_impl=unified_attention_with_output_fake,
-    tags=tag_cudagraph_unsafe,
 )
 
 
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index 5ea1b30860f59..cea4f9a816377 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
+import logging
 from typing import TYPE_CHECKING
 
 from torch._library.utils import lookup_op
@@ -38,8 +39,16 @@ def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]:
             resolved.append(lookup_op(op_name))
         except Exception:
             # Skip operators that don't exist (e.g., model-specific ops)
-            logger.warning(
-                "Failed to resolve operator for Inductor partition: %s", op_name
+            # Do not warn for attention ops, warn for others
+            # (most likely manually specified)
+            from vllm.config import CompilationConfig
+
+            logger.log(
+                logging.DEBUG
+                if op_name in CompilationConfig._attention_ops
+                else logging.WARNING,
+                "Failed to resolve operator for CUDAGraph partition: %s",
+                op_name,
             )
             continue
 
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 60aef2f6f7e1c..fb80835ba48a1 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -201,7 +201,7 @@ class CompilationConfig:
     (it sees a part of the graph). The backend can not be custom for compilation
     level 3, i.e. the backend must be either eager or inductor. Furthermore,
     compilation is only piecewise if splitting ops is set accordingly and
-    use_inductor_cudagraphs_partition is off. Note that the default options for
+    use_inductor_graph_partition is off. Note that the default options for
     splitting ops are sufficient for piecewise compilation.
     """
     custom_ops: list[str] = field(default_factory=list)
@@ -431,6 +431,7 @@ class CompilationConfig:
         factors.append(self.custom_ops)
         factors.append(self.splitting_ops)
         factors.append(self.use_inductor)
+        factors.append(self.use_inductor_graph_partition)
         factors.append(self.inductor_compile_config)
         factors.append(self.inductor_passes)
         factors.append(self.pass_config.uuid())

From 07ca70af8d8a0d0e20727d8de6972a7ad87cf996 Mon Sep 17 00:00:00 2001
From: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Date: Tue, 14 Oct 2025 18:41:18 -0700
Subject: [PATCH 07/51] [Core][Easy] Use envs.__getattr__ to unify all
 environment variable access (#26810)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
---
 vllm/multimodal/cache.py         | 6 +++---
 vllm/transformers_utils/utils.py | 4 ++--
 vllm/utils/gc_utils.py           | 6 +++---
 vllm/v1/engine/async_llm.py      | 5 ++---
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index f6ef675aa7c29..a29da2a56afc1 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -10,12 +10,12 @@ from typing import TYPE_CHECKING, Generic, TypeAlias, TypeVar, cast
 import torch
 from typing_extensions import override
 
+import vllm.envs as envs
 from vllm.distributed.device_communicators.shm_object_storage import (
     MsgpackSerde,
     SingleWriterShmObjectStorage,
     SingleWriterShmRingBuffer,
 )
-from vllm.envs import VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME
 from vllm.logger import init_logger
 from vllm.utils import GiB_bytes, MiB_bytes
 from vllm.utils.cache import CacheInfo, LRUCache
@@ -436,7 +436,7 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
 
         ring_buffer = SingleWriterShmRingBuffer(
             data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes),
-            name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
+            name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
             create=True,  # sender is the writer
         )
         self._shm_cache = SingleWriterShmObjectStorage(
@@ -678,7 +678,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):
 
         ring_buffer = SingleWriterShmRingBuffer(
             data_buffer_size=int(mm_config.mm_processor_cache_gb * GiB_bytes),
-            name=VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
+            name=envs.VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME,
             create=False,  # Server is a reader
         )
         self._shm_cache = SingleWriterShmObjectStorage(
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index b87414d79df0f..58c754dbd3974 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -8,7 +8,7 @@ from os import PathLike
 from pathlib import Path
 from typing import Any
 
-from vllm.envs import VLLM_MODEL_REDIRECT_PATH
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -86,7 +86,7 @@ def maybe_model_redirect(model: str) -> str:
     :return: maybe redirect to a local folder
     """
 
-    model_redirect_path = VLLM_MODEL_REDIRECT_PATH
+    model_redirect_path = envs.VLLM_MODEL_REDIRECT_PATH
 
     if not model_redirect_path:
         return model
diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py
index 99c19c9db28e9..6894ccff11d93 100644
--- a/vllm/utils/gc_utils.py
+++ b/vllm/utils/gc_utils.py
@@ -7,7 +7,7 @@ from collections import Counter
 from contextlib import suppress
 from typing import Any
 
-from vllm.envs import VLLM_GC_DEBUG
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -36,7 +36,7 @@ class GCDebugConfig:
                 self.top_objects = json_conf.get("top_objects", -1)
             except Exception:
                 self.enabled = False
-                logger.error("Failed to parse VLLM_GC_DEBUG(%s)", VLLM_GC_DEBUG)
+                logger.error("Failed to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG)
         logger.info("GC Debug Config. %s", str(self))
 
     def __repr__(self) -> str:
@@ -93,7 +93,7 @@ def maybe_attach_gc_debug_callback() -> None:
     """
     Attached a callback for GC debug when VLLM_GC_DEBUG is enabled.
     """
-    config = GCDebugConfig(VLLM_GC_DEBUG)
+    config = GCDebugConfig(envs.VLLM_GC_DEBUG)
     if config.enabled:
         debugger: GCDebugger = GCDebugger(config)
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 39cd1d97c280a..0ec153e233161 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -16,7 +16,6 @@ from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.utils import _validate_truncation_size
-from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE
 from vllm.inputs import PromptType
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -483,12 +482,12 @@ class AsyncLLM(EngineClient):
                     # Split outputs into chunks of at most
                     # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
                     # event loop for too long.
-                    if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
+                    if num_outputs <= envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
                         slices = (outputs.outputs,)
                     else:
                         slices = np.array_split(
                             outputs.outputs,
-                            cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE),
+                            cdiv(num_outputs, envs.VLLM_V1_OUTPUT_PROC_CHUNK_SIZE),
                         )
 
                     for i, outputs_slice in enumerate(slices):

From 9354660036dff11a81433f0695c71dfee75cce50 Mon Sep 17 00:00:00 2001
From: Zhikaiiii <55917203+Zhikaiiii@users.noreply.github.com>
Date: Wed, 15 Oct 2025 09:50:30 +0800
Subject: [PATCH 08/51] [Bugfix]fix Qwen3 xml tool parser (#26345)

Signed-off-by: Zhikaiiii <1658973216@qq.com>
---
 tests/tool_use/test_qwen3coder_tool_parser.py |  88 ++++++++++++-
 .../tool_parsers/qwen3xml_tool_parser.py      | 117 ++++++++++++++----
 2 files changed, 179 insertions(+), 26 deletions(-)

diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py
index b4f0989b1b19c..93ef1049fc07e 100644
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@@ -40,7 +40,7 @@ def qwen3_xml_tool_parser(qwen3_tokenizer):
     return Qwen3XMLToolParser(qwen3_tokenizer)
 
 
-@pytest.fixture(params=["original", "xml"])
+@pytest.fixture(params=["xml"])
 def qwen3_tool_parser_parametrized(qwen3_tool_parser, qwen3_xml_tool_parser, request):
     """Parameterized fixture that provides both parser types for testing"""
     if request.param == "original":
@@ -664,6 +664,9 @@ def test_extract_tool_calls_streaming(
 
     # Verify we got all expected tool calls
     assert len(tool_states) == len(expected_tool_calls)
+    assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == len(
+        expected_tool_calls
+    )
 
     # Verify each tool call
     for idx, expected_tool in enumerate(expected_tool_calls):
@@ -780,9 +783,10 @@ fahrenheit
 
     # Verify content was streamed
     assert "Let me check the weather for you:" in other_content
-
     # Verify we got the tool call
     assert len(tool_states) == 1
+    assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1
+
     state = tool_states[0]
     assert state["id"] is not None
     assert state["type"] == "function"
@@ -892,3 +896,83 @@ def test_extract_tool_calls_complex_type_with_single_quote(
 
     args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments)
     assert args["obj_param"] == {"key": "value"}
+
+
+def test_extract_tool_calls_streaming_missing_opening_tag(
+    qwen3_tool_parser_parametrized, qwen3_tokenizer, sample_tools
+):
+    """Test streaming with missing opening <tool_call> tag
+
+    This tests that the streaming parser correctly handles
+    tool calls that start directly with <function=...>
+    """
+    model_output = """I'll check the weather for you.
+
+<function=get_current_weather>
+<parameter=city>
+Dallas
+</parameter>
+<parameter=state>
+TX
+</parameter>
+<parameter=unit>
+fahrenheit
+</parameter>
+</function>
+</tool_call>"""
+
+    request = ChatCompletionRequest(model=MODEL, messages=[], tools=sample_tools)
+
+    other_content = ""
+    tool_states = {}
+
+    for delta_message in stream_delta_message_generator(
+        qwen3_tool_parser_parametrized, qwen3_tokenizer, model_output, request
+    ):
+        if delta_message.content:
+            other_content += delta_message.content
+
+        if delta_message.tool_calls:
+            for tool_call in delta_message.tool_calls:
+                idx = tool_call.index
+
+                if idx not in tool_states:
+                    tool_states[idx] = {
+                        "id": None,
+                        "name": None,
+                        "arguments": "",
+                        "type": None,
+                    }
+
+                if tool_call.id:
+                    tool_states[idx]["id"] = tool_call.id
+
+                if tool_call.type:
+                    assert tool_call.type == "function"
+                    tool_states[idx]["type"] = tool_call.type
+
+                if tool_call.function:
+                    if tool_call.function.name:
+                        tool_states[idx]["name"] = tool_call.function.name
+
+                    if tool_call.function.arguments is not None:
+                        tool_states[idx]["arguments"] += tool_call.function.arguments
+
+    # Verify content was streamed
+    assert "I'll check the weather for you." in other_content
+
+    # Verify we got the tool call
+    assert len(tool_states) == 1
+    assert len(qwen3_tool_parser_parametrized.prev_tool_call_arr) == 1
+
+    state = tool_states[0]
+    assert state["id"] is not None
+    assert state["type"] == "function"
+    assert state["name"] == "get_current_weather"
+
+    # Verify arguments were parsed correctly despite missing opening tag
+    assert state["arguments"] is not None
+    args = json.loads(state["arguments"])
+    assert args["city"] == "Dallas"
+    assert args["state"] == "TX"
+    assert args["unit"] == "fahrenheit"
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
index 2c5b0b6a85f76..9964d1ac25c40 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
@@ -2,13 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
 import json
-import uuid
 from collections.abc import Sequence
 from typing import Any
 from xml.parsers.expat import ParserCreate
 
 import regex as re
 
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionToolsParam,
@@ -375,14 +375,21 @@ class StreamingXMLToolCallParser:
                 return buffer[: tag_end2 + 1], start_pos + tag_end2 + 1
             else:
                 # If currently not parsing tool calls (entering a tool_call),
-                # check if starts with <tool_call>
+                # check if starts with <tool_call> or <function=
                 if self.current_call_id is None:
                     # Check if might be start of <tool_call>
                     if buffer == "<tool_call>"[: len(buffer)]:
                         # Might be start of <tool_call>, wait for more data
                         return None, start_pos
+                    elif (
+                        buffer.startswith("<function=")
+                        or buffer == "<function="[: len(buffer)]
+                    ):
+                        # Might be start of <function=, wait for more data
+                        # to get the complete function tag
+                        return None, start_pos
                     else:
-                        # Not start of <tool_call>, treat as text
+                        # Not start of <tool_call> or <function=, treat as text
                         return buffer, start_pos + len(buffer)
                 else:
                     # When parsing tool calls,
@@ -621,7 +628,7 @@ class StreamingXMLToolCallParser:
             self._auto_close_open_parameter_if_needed("tool_call")
 
             self.parameters = {}
-            self.current_call_id = self._get_next_call_id()
+            self.current_call_id = make_tool_call_id()
             self.current_param_is_first = True
             self.tool_call_index += 1
         elif name.startswith("function") or (name == "function"):
@@ -957,10 +964,6 @@ class StreamingXMLToolCallParser:
         """Set tool configuration information"""
         self.tools = tools
 
-    def _get_next_call_id(self):
-        """Generate unique call ID"""
-        return f"call_{uuid.uuid4().hex[:24]}"
-
     def _extract_function_name(self, name: str, attrs: dict[str, str]) -> str | None:
         """Extract function name from various formats"""
         if attrs and "name" in attrs:
@@ -1168,6 +1171,10 @@ class Qwen3XMLToolParser(ToolParser):
         super().__init__(tokenizer)
         self.parser = StreamingXMLToolCallParser()
 
+        # Add missing attributes for compatibility with serving_chat.py
+        self.prev_tool_call_arr: list[dict] = []
+        self.streamed_args_for_tool: list[str] = []
+
         logger.info(
             "vLLM Successfully import tool parser %s !", self.__class__.__name__
         )
@@ -1178,6 +1185,9 @@ class Qwen3XMLToolParser(ToolParser):
         request: ChatCompletionRequest,
     ) -> ExtractedToolCallInformation:
         self.parser.reset_streaming_state()
+        # Reset tool call tracking arrays for new extraction
+        self.prev_tool_call_arr = []
+        self.streamed_args_for_tool = []
         if request:
             self.parser.set_tools(request.tools)
         result = self.parser.parse_single_streaming_chunks(model_output)
@@ -1201,6 +1211,34 @@ class Qwen3XMLToolParser(ToolParser):
                             ),
                         )
                     )
+
+                    # Update tool call tracking arrays for compatibility
+                    tool_index = (
+                        tool_call.index
+                        if tool_call.index is not None
+                        else len(self.prev_tool_call_arr) - 1
+                    )
+
+                    # Ensure we have enough entries in our tracking arrays
+                    while len(self.prev_tool_call_arr) <= tool_index:
+                        self.prev_tool_call_arr.append({"name": "", "arguments": ""})
+                    while len(self.streamed_args_for_tool) <= tool_index:
+                        self.streamed_args_for_tool.append("")
+
+                    # Update tool call information
+                    self.prev_tool_call_arr[tool_index]["name"] = (
+                        tool_call.function.name
+                    )
+                    self.prev_tool_call_arr[tool_index]["arguments"] = (
+                        tool_call.function.arguments
+                    )
+
+                    # Update streamed arguments
+                    if tool_call.function.arguments:
+                        self.streamed_args_for_tool[tool_index] = (
+                            tool_call.function.arguments
+                        )
+
             return ExtractedToolCallInformation(
                 tool_calls=tool_calls,
                 tools_called=len(tool_calls) > 0,
@@ -1219,6 +1257,9 @@ class Qwen3XMLToolParser(ToolParser):
     ) -> DeltaMessage | None:
         if not previous_text:
             self.parser.reset_streaming_state()
+            # Reset tool call tracking arrays for new streaming session
+            self.prev_tool_call_arr = []
+            self.streamed_args_for_tool = []
             if request:
                 self.parser.set_tools(request.tools)
 
@@ -1230,20 +1271,48 @@ class Qwen3XMLToolParser(ToolParser):
             open_calls = current_text.count(
                 self.parser.tool_call_start_token
             ) - current_text.count(self.parser.tool_call_end_token)
-            if open_calls == 0 and self.parser.tool_call_index > 0:
-                # If current_call_id is None, use last_completed_call_id
-                call_id = (
-                    self.parser.current_call_id or self.parser.last_completed_call_id
-                )
-                return DeltaMessage(
-                    tool_calls=[
-                        DeltaToolCall(
-                            index=self.parser.tool_call_index - 1,
-                            id=call_id,
-                            function=DeltaFunctionCall(arguments=""),
-                            type="function",
-                        )
-                    ]
-                )
+            if (
+                open_calls == 0
+                and self.parser.tool_call_index > 0
+                or not self.parser.tool_call_index
+                and current_text
+            ):
+                return DeltaMessage(content="")
+            return None
 
-        return self.parser.parse_single_streaming_chunks(delta_text)
+        # Parse the delta text and get the result
+        result = self.parser.parse_single_streaming_chunks(delta_text)
+
+        # Update tool call tracking arrays based on incremental parsing results
+        if result and result.tool_calls:
+            for tool_call in result.tool_calls:
+                if tool_call.function:
+                    tool_index = (
+                        tool_call.index
+                        if tool_call.index is not None
+                        else len(self.prev_tool_call_arr) - 1
+                    )
+
+                    # Ensure we have enough entries in our tracking arrays
+                    while len(self.prev_tool_call_arr) <= tool_index:
+                        self.prev_tool_call_arr.append({"name": "", "arguments": ""})
+                    while len(self.streamed_args_for_tool) <= tool_index:
+                        self.streamed_args_for_tool.append("")
+
+                    # Update tool name if provided
+                    if tool_call.function.name:
+                        self.prev_tool_call_arr[tool_index]["name"] = (
+                            tool_call.function.name
+                        )
+
+                    # Update arguments incrementally
+                    if tool_call.function.arguments is not None:
+                        # Concatenate the incremental arguments
+                        # to the existing streamed arguments
+                        self.prev_tool_call_arr[tool_index]["arguments"] += (
+                            tool_call.function.arguments
+                        )
+                        self.streamed_args_for_tool[tool_index] += (
+                            tool_call.function.arguments
+                        )
+        return result

From bfad142e257be6699868f7816ca64c408bc32916 Mon Sep 17 00:00:00 2001
From: "Chendi.Xue" <chendi.xue@intel.com>
Date: Tue, 14 Oct 2025 21:33:25 -0500
Subject: [PATCH 09/51] [BUGFIX][NIXL] quick fix for 'assert
 self.connector_worker is not None' in get_kv_connector_stats (#26851)

Signed-off-by: Chendi Xue <chendi.xue@intel.com>
---
 vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 490f209373db3..6a2434ddce8be 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -241,7 +241,8 @@ class NixlConnector(KVConnectorBase_V1):
         return self.connector_worker.get_block_ids_with_load_errors()
 
     def get_kv_connector_stats(self) -> KVConnectorStats | None:
-        assert self.connector_worker is not None
+        if self.connector_worker is None:
+            return None
         return self.connector_worker.get_kv_connector_stats()
 
     @classmethod

From e66d787bce22c56f995f4e2974e31ac020bc57ea Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Tue, 14 Oct 2025 22:35:18 -0400
Subject: [PATCH 10/51] Disable FlashInfer sampler by default (#26859)

Signed-off-by: mgoin <mgoin64@gmail.com>
---
 vllm/v1/sample/ops/topk_topp_sampler.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index ed8bc55a3cf2f..43a40bce6847d 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -46,23 +46,15 @@ class TopKTopPSampler(nn.Module):
                         "Falling back to default sampling implementation."
                     )
                     self.forward = self.forward_native
-                elif envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
-                    # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
-                    # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
-                    # default it is unused). For backward compatibility, we set
-                    # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
-                    # interpret it differently in V0 and V1 samplers: In V0,
-                    # None means False, while in V1, None means True. This is
-                    # why we use the condition
-                    # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
+                elif envs.VLLM_USE_FLASHINFER_SAMPLER:
+                    # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
                     logger.info_once("Using FlashInfer for top-p & top-k sampling.")
                     self.forward = self.forward_cuda
                 else:
-                    logger.warning_once(
-                        "FlashInfer is available, but it is not enabled. "
-                        "Falling back to the PyTorch-native implementation of "
-                        "top-p & top-k sampling. For the best performance, "
-                        "please set VLLM_USE_FLASHINFER_SAMPLER=1."
+                    logger.debug_once(
+                        "FlashInfer top-p/top-k sampling is available but disabled "
+                        "by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in "
+                        "after verifying accuracy for your workloads."
                     )
                     self.forward = self.forward_native
             else:

From 96b9aa5aa076e64c68765232aec343e4d0006e2a Mon Sep 17 00:00:00 2001
From: Morrison Turnansky <mturnans@redhat.com>
Date: Tue, 14 Oct 2025 22:51:16 -0400
Subject: [PATCH 11/51] [Frontend][torch.compile] CompilationConfig Overhaul
 (#20283): rename compilation level to compilation mode, deprecate
 compilation level (#26355)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
---
 docs/configuration/conserving_memory.md       |   4 +-
 docs/design/cuda_graphs.md                    |   4 +-
 examples/offline_inference/data_parallel.py   |   2 +-
 .../compile/piecewise/test_multiple_graphs.py |  10 +-
 tests/compile/piecewise/test_simple.py        |   4 +-
 tests/compile/piecewise/test_toy_llama.py     |  10 +-
 tests/compile/test_aot_compile.py             |   4 +-
 tests/compile/test_async_tp.py                |   3 +-
 tests/compile/test_basic_correctness.py       |  30 +++--
 tests/compile/test_config.py                  |  20 ++--
 tests/compile/test_decorator.py               |  10 +-
 tests/compile/test_full_graph.py              |  29 ++---
 tests/compile/test_fusion.py                  |   4 +-
 tests/compile/test_fusion_all_reduce.py       |   4 +-
 tests/compile/test_fusion_attn.py             |   4 +-
 tests/compile/test_noop_elimination.py        |   6 +-
 tests/compile/test_wrapper.py                 |   4 +-
 tests/distributed/test_sequence_parallel.py   |   3 +-
 tests/engine/test_arg_utils.py                |  20 ++--
 tests/tpu/test_custom_dispatcher.py           |   6 +-
 tests/utils_/test_utils.py                    |  10 +-
 tests/v1/cudagraph/test_cudagraph_dispatch.py |  22 ++--
 tests/v1/cudagraph/test_cudagraph_mode.py     |  39 +++----
 tests/v1/e2e/test_kv_sharing_fast_prefill.py  |   6 +-
 vllm/compilation/backends.py                  |   4 +-
 vllm/compilation/compiler_interface.py        |   2 +-
 vllm/compilation/counter.py                   |   4 +-
 vllm/compilation/decorators.py                |  10 +-
 vllm/compilation/monitor.py                   |   6 +-
 vllm/compilation/wrapper.py                   |   8 +-
 vllm/config/__init__.py                       |   4 +-
 vllm/config/compilation.py                    | 106 ++++++++++++------
 vllm/config/vllm.py                           |  50 ++++-----
 vllm/entrypoints/llm.py                       |   6 +-
 .../layers/quantization/utils/w8a8_utils.py   |   4 +-
 vllm/platforms/cpu.py                         |   8 +-
 vllm/platforms/tpu.py                         |  11 +-
 vllm/platforms/xpu.py                         |   4 +-
 vllm/utils/__init__.py                        |  10 +-
 vllm/v1/cudagraph_dispatcher.py               |   4 +-
 vllm/v1/spec_decode/eagle.py                  |   4 +-
 vllm/v1/worker/gpu_model_runner.py            |  15 +--
 42 files changed, 270 insertions(+), 248 deletions(-)

diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
index 2b0654fa6d463..85906d23dee33 100644
--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -58,12 +58,12 @@ You can adjust `compilation_config` to achieve a better balance between inferenc
 
     ```python
     from vllm import LLM
-    from vllm.config import CompilationConfig, CompilationLevel
+    from vllm.config import CompilationConfig, CompilationMode
 
     llm = LLM(
         model="meta-llama/Llama-3.1-8B-Instruct",
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             # By default, it goes up to max_num_seqs
             cudagraph_capture_sizes=[1, 2, 4, 8, 16],
         ),
diff --git a/docs/design/cuda_graphs.md b/docs/design/cuda_graphs.md
index 315746b0ef674..c6d71589be985 100644
--- a/docs/design/cuda_graphs.md
+++ b/docs/design/cuda_graphs.md
@@ -167,7 +167,7 @@ class AttentionCGSupport(enum.Enum):
     """NO CUDA Graphs support"""
 ```
 
-Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation level. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
+Suppose we have hybrid attention backends (e.g., in mamba mixer models). In that case, we seek the minimum capability of all backends to determine the final capability of the model, and we might resolve the incompatible CUDA Graphs mode by downgrading the mode to the best fit one. For example, downgrading `FULL` mode to `FULL_AND_PIECEWISE` mode if the minimum capability is `UNIFORM_BATCH`, or `PIECEWISE` mode if the minimum capability is `NEVER` for -O3 compilation mode. For the complete fallback policy, please see the code of [initialize_cudagraph_capture][vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_cudagraph_capture].
 
 The following table lists backends that support full CUDA Graphs at the time of writing.
 
@@ -202,7 +202,7 @@ os.environ.setdefault("VLLM_LOGGING_LEVEL", "DEBUG")
 import vllm
 from vllm.config import CUDAGraphMode
 
-compilation_config = {"level": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
+compilation_config = {"mode": 3, "cudagraph_mode": "FULL_AND_PIECEWISE"}
 model = vllm.LLM(
     model="meta-llama/Llama-3.1-8B-Instruct",
     dtype="auto",
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index 0076d4d30ee8e..a3e671a0f4cca 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -95,7 +95,7 @@ def parse_args():
     parser.add_argument(
         "--compilation-config",
         type=int,
-        help=("Compilation optimization (O) level 0-3."),
+        help=("Compilation optimization (O) mode 0-3."),
     )
     parser.add_argument(
         "--quantization",
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
index 0d265bc596386..d1f741479acf4 100644
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -14,7 +14,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import ignore_torch_compile, support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -199,10 +199,10 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
 
     outputs = []
 
-    # piecewise compile
+    # compile in VLLM_COMPILE mode
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -251,7 +251,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     # no compile or cudagraph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION,
+            mode=CompilationMode.NONE,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.NONE
@@ -280,7 +280,7 @@ def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
     # piecewise compile without CUDA graph
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=False,
             splitting_ops=["silly::attention"],
             use_inductor_graph_partition=use_inductor_graph_partition,
diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py
index bc65e3da0ae74..f61a0a4eb740d 100644
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@@ -13,7 +13,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -61,7 +61,7 @@ def _run_simple_model(
 ):
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             use_inductor=use_inductor,
             splitting_ops=splitting_ops,
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 7ab610fa78115..75a89d692fa8f 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -21,7 +21,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -356,13 +356,13 @@ def test_toy_llama(
     )
 
     compile_config_no_compile = CompilationConfig(
-        level=CompilationLevel.NO_COMPILATION,
+        level=CompilationMode.NONE,
         cudagraph_mode=CUDAGraphMode.NONE,
         backend="eager",
     )
 
     compile_config_no_split = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        level=CompilationMode.VLLM_COMPILE,
         use_inductor_graph_partition=use_inductor_graph_partition,
         cudagraph_mode=CUDAGraphMode.PIECEWISE,
         backend=backend,
@@ -458,14 +458,14 @@ def benchmark():
     for piecewise in [False, True]:
         if piecewise:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 use_cudagraph=True,
                 splitting_ops=["silly::attention"],
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
         else:
             compilation_config = CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 cudagraph_capture_sizes=cudagraph_sizes,
             )
 
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index 08f79d90cd367..1701d85fe84e7 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -10,7 +10,7 @@ import torch
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     VllmConfig,
     set_current_vllm_config,
 )
@@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
 def make_vllm_config() -> VllmConfig:
     return VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
         )
     )
 
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
index 102a929bf2409..60856f5a58067 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -10,6 +10,7 @@ import vllm.envs as envs
 from vllm.compilation.collective_fusion import AsyncTPPass
 from vllm.config import (
     CompilationConfig,
+    CompilationMode,
     DeviceConfig,
     ModelConfig,
     PassConfig,
@@ -400,7 +401,7 @@ def test_async_tp_pass_correctness(
         common_args.append("--enforce-eager")
 
     compilation_config = {
-        "level": 3,
+        "mode": CompilationMode.VLLM_COMPILE,
         "compile_sizes": [2, 4, 8],
         "splitting_ops": [],
         "pass_config": {"enable_async_tp": async_tp_enabled},
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index ab6a17e149fcd..954774a8e3983 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -4,7 +4,7 @@ import dataclasses
 
 import pytest
 
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 from vllm.utils import cuda_device_count_stateless
 
 from ..utils import compare_all_settings
@@ -21,7 +21,7 @@ class TestSetting:
 
 
 # we cannot afford testing the full Cartesian product
-# of all models and all levels
+# of all models and all modes
 @pytest.mark.parametrize(
     "test_setting",
     [
@@ -121,15 +121,13 @@ def test_compile_correctness(
         all_args: list[list[str]] = []
         all_envs: list[dict[str, str] | None] = []
 
-        for comp_level in [
-            CompilationLevel.DYNAMO_AS_IS,
-            CompilationLevel.DYNAMO_ONCE,
-            CompilationLevel.PIECEWISE,
+        for comp_mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+            CompilationMode.VLLM_COMPILE,
         ]:
-            for level in [CompilationLevel.NO_COMPILATION, comp_level]:
-                all_args.append(
-                    final_args + [f"-O.level={level}", "-O.backend=inductor"]
-                )
+            for mode in [CompilationMode.NONE, comp_mode]:
+                all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=inductor"])
 
             # inductor will change the output, so we only compare if the output
             # is close, not exactly the same.
@@ -142,13 +140,13 @@ def test_compile_correctness(
             all_envs.clear()
             all_args.clear()
 
-        for level in [
-            CompilationLevel.NO_COMPILATION,
-            CompilationLevel.DYNAMO_AS_IS,
-            CompilationLevel.DYNAMO_ONCE,
-            CompilationLevel.PIECEWISE,
+        for mode in [
+            CompilationMode.NONE,
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+            CompilationMode.VLLM_COMPILE,
         ]:
-            all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
+            all_args.append(final_args + [f"-O.mode={mode}", "-O.backend=eager"])
             all_envs.append({})
             all_envs.append({})
 
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index ae8b0b226c313..7f51c763da73c 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -4,7 +4,7 @@ import pytest
 
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
-from vllm.config.compilation import CompilationLevel
+from vllm.config.compilation import CompilationMode
 from vllm.utils import _is_torch_equal_or_newer, is_torch_equal_or_newer
 
 
@@ -90,16 +90,16 @@ def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
 
 # forked needed to workaround https://github.com/vllm-project/vllm/issues/21073
 @pytest.mark.forked
-def test_dynamo_as_is(vllm_runner, monkeypatch):
+def test_stock_torch_compile(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-        compilation_counter.expect(dynamo_as_is_count=1),
+        compilation_counter.expect(stock_torch_compile_count=1),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m",
-            compilation_config={"level": 1},
+            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
             gpu_memory_utilization=0.4,
         ) as _,
     ):
@@ -112,11 +112,11 @@ def test_no_compilation(vllm_runner, monkeypatch):
     # Disable multiprocessing so that the counter is in the same process
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m",
-            compilation_config={"level": 0},
+            compilation_config={"mode": CompilationMode.NONE},
             gpu_memory_utilization=0.4,
         ) as _,
     ):
@@ -130,7 +130,7 @@ def test_enforce_eager(vllm_runner, monkeypatch):
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
 
     with (
-        compilation_counter.expect(num_graphs_seen=0, dynamo_as_is_count=0),
+        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
         # loading the model causes compilation (if enabled) to happen
         vllm_runner(
             "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
@@ -151,7 +151,7 @@ def test_splitting_ops_dynamic():
     if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_inductor_graph_partition=True,
                 splitting_ops=["vllm::unified_attention"],
             )
@@ -163,7 +163,7 @@ def test_splitting_ops_dynamic():
     # When attn_fusion pass enabled, splitting_ops now default to attention ops.
     config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            level=CompilationMode.VLLM_COMPILE,
             pass_config={"enable_attn_fusion": True, "enable_noop": True},
             custom_ops=["+quant_fp8"],
             cudagraph_mode=CUDAGraphMode.PIECEWISE,
@@ -178,7 +178,7 @@ def test_splitting_ops_dynamic():
     if is_torch_equal_or_newer("2.9.0.dev"):
         config = VllmConfig(
             compilation_config=CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                level=CompilationMode.VLLM_COMPILE,
                 use_inductor_graph_partition=True,
                 pass_config={"enable_attn_fusion": True, "enable_noop": True},
                 custom_ops=["+quant_fp8"],
diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
index 63cb266094a12..4d60899a628a9 100644
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -8,7 +8,7 @@ from vllm.compilation.decorators import ignore_torch_compile, support_torch_comp
 from vllm.config import (
     CacheConfig,
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     set_current_vllm_config,
@@ -66,10 +66,10 @@ def run_model(
 
 
 def test_ignore_torch_compile_decorator():
-    # piecewise
+    # VLLM_COMPILE mode
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -185,7 +185,7 @@ def test_conditional_compile_enable_if():
             kv_sharing_fast_prefill=True,
         ),
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
@@ -218,7 +218,7 @@ def test_conditional_compile_enable_if():
             kv_sharing_fast_prefill=False,
         ),
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 2f3794c90b204..2d290771f9ad7 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -12,7 +12,7 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
 from vllm.attention.backends.registry import _Backend
 from vllm.attention.selector import global_force_attn_backend_context_manager
-from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer
 
@@ -80,22 +80,22 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
 
 
 @pytest.mark.parametrize(
-    "optimization_level",
-    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+    "compilation_mode",
+    [CompilationMode.DYNAMO_TRACE_ONCE, CompilationMode.VLLM_COMPILE],
 )
 @pytest.mark.parametrize("model_info", models_list(all=True))
 @create_new_process_for_each_test()
 def test_full_graph(
     monkeypatch: pytest.MonkeyPatch,
     model_info: tuple[str, dict[str, Any]],
-    optimization_level: int,
+    compilation_mode: int,
 ):
     model, model_kwargs = model_info
 
     with monkeypatch.context():
         print(f"MODEL={model}")
 
-        run_model(optimization_level, model, model_kwargs)
+        run_model(compilation_mode, model, model_kwargs)
 
 
 # TODO(luka) add other supported compilation config scenarios here
@@ -104,7 +104,7 @@ def test_full_graph(
     [
         # additional compile sizes, only some of the models
         (
-            CompilationConfig(level=CompilationLevel.PIECEWISE, compile_sizes=[1, 2]),
+            CompilationConfig(mode=CompilationMode.VLLM_COMPILE, compile_sizes=[1, 2]),
             model,
         )
         for model in models_list(all=False)
@@ -113,7 +113,7 @@ def test_full_graph(
         # RMSNorm + quant fusion, only 8-bit quant models
         (
             CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 custom_ops=["+rms_norm"],
                 pass_config=PassConfig(enable_fusion=True, enable_noop=True),
             ),
@@ -125,7 +125,8 @@ def test_full_graph(
         # Test depyf integration works
         (
             CompilationConfig(
-                level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()
+                mode=CompilationMode.VLLM_COMPILE,
+                debug_dump_path=tempfile.gettempdir(),
             ),
             ("facebook/opt-125m", {}),
         ),
@@ -134,7 +135,7 @@ def test_full_graph(
         # graph inductor partition
         (
             CompilationConfig(
-                level=CompilationLevel.PIECEWISE,
+                mode=CompilationMode.VLLM_COMPILE,
                 # inductor graph partition uses
                 # torch._C.Tag.cudagraph_unsafe to specify splitting ops
                 use_inductor_graph_partition=True,
@@ -164,10 +165,10 @@ def test_custom_compile_config(
 
 
 @pytest.mark.parametrize(
-    "optimization_level",
-    [CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE],
+    "compilation_mode",
+    [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
 )
-def test_fp8_kv_scale_compile(optimization_level: int):
+def test_fp8_kv_scale_compile(compilation_mode: int):
     model = "Qwen/Qwen2-0.5B"
     model_kwargs = {
         "quantization": "fp8",
@@ -175,7 +176,7 @@ def test_fp8_kv_scale_compile(optimization_level: int):
         "calculate_kv_scales": True,
         "max_model_len": 512,
     }
-    run_model(optimization_level, model, model_kwargs)
+    run_model(compilation_mode, model, model_kwargs)
 
 
 def test_inductor_graph_partition_attn_fusion(caplog_vllm):
@@ -184,7 +185,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
 
     model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
     compilation_config = CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+        mode=CompilationMode.VLLM_COMPILE,
         use_inductor_graph_partition=True,
         cudagraph_mode=CUDAGraphMode.PIECEWISE,
         custom_ops=["+quant_fp8"],
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 7c22336432299..1a5eaf2639b36 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -13,7 +13,7 @@ from vllm.compilation.fusion import (
 )
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
-from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
@@ -114,7 +114,7 @@ def test_fusion_rmsnorm_quant(
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             custom_ops=["+rms_norm", "+quant_fp8"],
             pass_config=PassConfig(enable_fusion=True, enable_noop=True),
         )
diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py
index 455d1bb039057..fbcd6c71fb723 100644
--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -12,7 +12,7 @@ from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     DeviceConfig,
     ModelConfig,
     PassConfig,
@@ -219,7 +219,7 @@ def all_reduce_fusion_pass_on_test_model(
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm", "+quant_fp8"]
+            mode=CompilationMode.VLLM_COMPILE, custom_ops=["+rms_norm", "+quant_fp8"]
         )
     )
     vllm_config.compilation_config.pass_config = PassConfig(
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index d1ab85cfb875c..a8d78daa32a1d 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -19,7 +19,7 @@ from vllm.compilation.post_cleanup import PostCleanupPass
 from vllm.config import (
     CacheConfig,
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     ModelConfig,
     PassConfig,
     SchedulerConfig,
@@ -321,7 +321,7 @@ def test_attention_quant_pattern(
         ),
         scheduler_config=SchedulerConfig(max_num_seqs=1024),
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             custom_ops=["+quant_fp8"],
             use_inductor_graph_partition=use_inductor_graph_partition,
         ),
diff --git a/tests/compile/test_noop_elimination.py b/tests/compile/test_noop_elimination.py
index 188f4514dda5f..0ccc1a0161629 100644
--- a/tests/compile/test_noop_elimination.py
+++ b/tests/compile/test_noop_elimination.py
@@ -6,7 +6,7 @@ import torch
 
 import vllm
 from vllm.compilation.noop_elimination import NoOpEliminationPass
-from vllm.config import CompilationConfig, CompilationLevel, PassConfig, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, PassConfig, VllmConfig
 
 from .backend import TestBackend
 
@@ -50,7 +50,7 @@ def test_noop_elimination(dtype, num_tokens, hidden_size, buffer_size):
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             pass_config=PassConfig(enable_noop=True),
         )
     )
@@ -98,7 +98,7 @@ def test_non_noop_slice_preserved():
 
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             pass_config=PassConfig(enable_noop=True),
         )
     )
diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py
index b2fff822bbbb5..da0afd9eaa49f 100644
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -5,7 +5,7 @@
 import torch
 
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 
 
 class MyMod(torch.nn.Module):
@@ -20,7 +20,7 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
         self.model = model
         compiled_callable = torch.compile(self.forward, backend="eager")
         super().__init__(
-            compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
+            compiled_callable, compilation_mode=CompilationMode.DYNAMO_TRACE_ONCE
         )
 
     def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index a431bf30fc890..362e9daf5ae04 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple
 
 import pytest
 
+from vllm.config.compilation import CompilationMode
 from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
 
@@ -234,7 +235,7 @@ def _compare_sp(
         common_args.append("--skip-tokenizer-init")
 
     compilation_config = {
-        "level": 3,
+        "mode": CompilationMode.VLLM_COMPILE,
         "custom_ops": ["+rms_norm"],
         "compile_sizes": [4, 8],
         "pass_config": {
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 78928a53942f9..c73083b0b5ef6 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -226,30 +226,30 @@ def test_compilation_config():
 
     # set to O3
     args = parser.parse_args(["-O0"])
-    assert args.compilation_config.level == 0
+    assert args.compilation_config.mode == 0
 
     # set to O 3 (space)
     args = parser.parse_args(["-O", "1"])
-    assert args.compilation_config.level == 1
+    assert args.compilation_config.mode == 1
 
     # set to O 3 (equals)
     args = parser.parse_args(["-O=2"])
-    assert args.compilation_config.level == 2
+    assert args.compilation_config.mode == 2
 
-    # set to O.level 3
-    args = parser.parse_args(["-O.level", "3"])
-    assert args.compilation_config.level == 3
+    # set to O.mode 3
+    args = parser.parse_args(["-O.mode", "3"])
+    assert args.compilation_config.mode == 3
 
     # set to string form of a dict
     args = parser.parse_args(
         [
             "-O",
-            '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
             '"use_inductor": false}',
         ]
     )
     assert (
-        args.compilation_config.level == 3
+        args.compilation_config.mode == 3
         and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
         and not args.compilation_config.use_inductor
     )
@@ -258,12 +258,12 @@ def test_compilation_config():
     args = parser.parse_args(
         [
             "--compilation-config="
-            '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
             '"use_inductor": true}',
         ]
     )
     assert (
-        args.compilation_config.level == 3
+        args.compilation_config.mode == 3
         and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
         and args.compilation_config.use_inductor
     )
diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py
index 102e5ddf16d6d..cf455ff3edbd3 100644
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.config import CompilationLevel
+from vllm.config import CompilationMode
 
 from ..utils import compare_two_settings
 
@@ -21,13 +21,13 @@ def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
                 "--max-model-len=256",
                 "--max-num-seqs=32",
                 "--enforce-eager",
-                f"-O{CompilationLevel.DYNAMO_ONCE}",
+                f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
             ],
             arg2=[
                 "--max-model-len=256",
                 "--max-num-seqs=32",
                 "--enforce-eager",
-                f"-O{CompilationLevel.DYNAMO_AS_IS}",
+                f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
             ],
             env1={},
             env2={},
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index 308629ab05834..af5fc758f2c26 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -299,7 +299,7 @@ def test_dict_args(parser):
         "val2",
         "--hf-overrides.key2.key4",
         "val3",
-        # Test compile config and compilation level
+        # Test compile config and compilation mode
         "-O.use_inductor=true",
         "-O.backend",
         "custom",
@@ -352,7 +352,7 @@ def test_dict_args(parser):
         },
     }
     assert parsed_args.compilation_config == {
-        "level": 1,
+        "mode": 1,
         "use_inductor": True,
         "backend": "custom",
         "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
@@ -367,7 +367,7 @@ def test_duplicate_dict_args(caplog_vllm, parser):
         "--hf-overrides.key1",
         "val2",
         "-O1",
-        "-O.level",
+        "-O.mode",
         "2",
         "-O3",
     ]
@@ -375,12 +375,12 @@ def test_duplicate_dict_args(caplog_vllm, parser):
     parsed_args = parser.parse_args(args)
     # Should be the last value
     assert parsed_args.hf_overrides == {"key1": "val2"}
-    assert parsed_args.compilation_config == {"level": 3}
+    assert parsed_args.compilation_config == {"mode": 3}
 
     assert len(caplog_vllm.records) == 1
     assert "duplicate" in caplog_vllm.text
     assert "--hf-overrides.key1" in caplog_vllm.text
-    assert "-O.level" in caplog_vllm.text
+    assert "-O.mode" in caplog_vllm.text
 
 
 @pytest.mark.parametrize(
diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py
index 59841a446db3e..02fa27e3f05f7 100644
--- a/tests/v1/cudagraph/test_cudagraph_dispatch.py
+++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py
@@ -11,7 +11,7 @@ from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     ParallelConfig,
     SchedulerConfig,
@@ -42,7 +42,7 @@ def _create_vllm_config(
     mock_config.parallel_config = ParallelConfig()
 
     # Mimic the behavior of VllmConfig.__post_init__()
-    if compilation_config.level == CompilationLevel.PIECEWISE:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
         compilation_config.set_splitting_ops_for_v1()
 
     return mock_config
@@ -50,23 +50,23 @@ def _create_vllm_config(
 
 class TestCudagraphDispatcher:
     @pytest.mark.parametrize(
-        "case_id,cudagraph_mode_str,compilation_level",
+        "case_id,cudagraph_mode_str,compilation_mode",
         [
             # Test case 0: Full CG for mixed batches, no separate routine
-            (0, "FULL", CompilationLevel.NO_COMPILATION),
+            (0, "FULL", CompilationMode.NONE),
             # Test case 1: Full CG for uniform batches, piecewise for mixed
-            (1, "FULL_AND_PIECEWISE", CompilationLevel.NO_COMPILATION),
+            (1, "FULL_AND_PIECEWISE", CompilationMode.NONE),
             # Test case 2: Full CG for uniform batches, no CG for mixed
-            (2, "FULL_DECODE_ONLY", CompilationLevel.NO_COMPILATION),
-            # Test case 3: Piecewise for all
-            (3, "PIECEWISE", CompilationLevel.PIECEWISE),
+            (2, "FULL_DECODE_ONLY", CompilationMode.NONE),
+            # Test case 3: PIECEWISE for all
+            (3, "PIECEWISE", CompilationMode.VLLM_COMPILE),
         ],
     )
-    def test_dispatcher(self, cudagraph_mode_str, compilation_level):
+    def test_dispatcher(self, cudagraph_mode_str, compilation_mode):
         # Setup dispatcher
         comp_config = CompilationConfig(
             cudagraph_mode=cudagraph_mode_str,
-            level=compilation_level,
+            mode=compilation_mode,
             cudagraph_capture_sizes=[1, 8],
         )
 
@@ -242,7 +242,7 @@ class TestCudagraphIntegration:
     def setup_method(self):
         # only FULL mode for non-uniform batches
         self.comp_config = CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
+            mode=CompilationMode.VLLM_COMPILE,
             cudagraph_mode="FULL",
             cudagraph_capture_sizes=[10, 20],
         )
diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py
index 8c8148ae20948..818ae1d7ba677 100644
--- a/tests/v1/cudagraph/test_cudagraph_mode.py
+++ b/tests/v1/cudagraph/test_cudagraph_mode.py
@@ -10,7 +10,7 @@ import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
 from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM
-from vllm.config import CompilationConfig
+from vllm.config import CompilationConfig, CompilationMode
 from vllm.platforms import current_platform
 
 
@@ -73,7 +73,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
             gpu_memory_utilization=0.45,
             max_model_len=1024,
             compilation_config=CompilationConfig(
-                level=3, cudagraph_mode=cudagraph_mode
+                mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
             ),
         )
         llm.generate(["Hello, my name is"] * 10)
@@ -90,32 +90,27 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
     )
 
 
-# test cudagraph_mode with different compilation level.
-# (backend_name, cudagraph_mode, compilation_level, supported)
+# test cudagraph_mode with different compilation mode.
+# (backend_name, cudagraph_mode, compilation_mode, supported)
 combo_cases_2 = [
-    ("FA2", "FULL", 0, True),  # no compilation + full cudagraph
-    ("FA2", "FULL", 3, True),  # piecewise compilation + full cudagraph
-    ("FA2", "PIECEWISE", 0, False),  # no compilation + piecewise cudagraph
-    ("FA2", "PIECEWISE", 3, True),  # piecewise compilation + piecewise cudagraph
-    (
-        "FA2",
-        "FULL_AND_PIECEWISE",
-        0,
-        False,
-    ),  # piecewise cudagraph not supported without piecewise compilation
-    ("FA2", "FULL_AND_PIECEWISE", 3, True),
-    ("FA2", "FULL_DECODE_ONLY", 0, True),
-    ("FA2", "FULL_DECODE_ONLY", 3, True),
-    ("FA2", "NONE", 0, True),  # no compilation + no cudagraph
-    ("FA2", "NONE", 3, True),  # piecewise compilation + no cudagraph
+    ("FA2", "FULL", CompilationMode.NONE, True),
+    ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "PIECEWISE", CompilationMode.NONE, False),
+    ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
+    ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
+    ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
+    ("FA2", "NONE", CompilationMode.NONE, True),
+    ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
 ]
 
 
 @pytest.mark.parametrize(
-    "backend_name,cudagraph_mode,compilation_level,supported", combo_cases_2
+    "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2
 )
 def test_cudagraph_compilation_combo(combo_case):
-    backend_name, cudagraph_mode, compilation_level, supported = combo_case
+    backend_name, cudagraph_mode, compilation_mode, supported = combo_case
 
     env_vars = backend_configs[backend_name].env_vars
 
@@ -130,7 +125,7 @@ def test_cudagraph_compilation_combo(combo_case):
             gpu_memory_utilization=0.45,
             max_model_len=1024,
             compilation_config=CompilationConfig(
-                level=compilation_level, cudagraph_mode=cudagraph_mode
+                mode=compilation_mode, cudagraph_mode=cudagraph_mode
             ),
         )
         llm.generate(["Hello, my name is"] * 10)
diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
index 89e5f26ac627f..f2c6d1c1fd1a4 100644
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -7,7 +7,7 @@ import pytest
 import torch
 
 from vllm import LLM, SamplingParams
-from vllm.config import CompilationConfig, CompilationLevel
+from vllm.config import CompilationConfig, CompilationMode
 from vllm.distributed import cleanup_dist_env_and_memory
 
 from ...utils import fork_new_process_for_each_test
@@ -75,9 +75,9 @@ def test_kv_sharing_fast_prefill(
         # This allows vLLM compilation backend to handle allocating and
         # managing buffers for cudagraph
         cudagraph_copy_inputs=True,
-        level=CompilationLevel.PIECEWISE
+        mode=CompilationMode.VLLM_COMPILE
         if not enforce_eager
-        else CompilationLevel.NO_COMPILATION,
+        else CompilationMode.NONE,
     )
 
     with monkeypatch.context() as m:
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 46c433fe6aefb..91be7e85af518 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -56,7 +56,7 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
             return InductorAdaptor()
     else:
         assert compilation_config.backend == "eager", (
-            "Custom backends not supported with CompilationLevel.PIECEWISE"
+            "Custom backends not supported with CompilationMode.VLLM_COMPILE"
         )
 
         logger.debug("Using EagerAdaptor")
@@ -481,7 +481,7 @@ def set_model_tag(tag: str):
 
 class VllmBackend:
     """The compilation backend for `torch.compile` with vLLM.
-    It is used for compilation level of `CompilationLevel.PIECEWISE`,
+    It is used for compilation mode of `CompilationMode.VLLM_COMPILE`,
     where we customize the compilation.
 
     The major work of this backend is to split the graph into
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 4553007027e39..e2369a635ad1f 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -575,7 +575,7 @@ class InductorAdaptor(CompilerInterface):
 
         Because it is re-entrant, we always set it (even if entering via Dynamo
         and the context was already entered). We might want to revisit if it
-        should be set at a different level of compilation.
+        should be set at a different layer of compilation.
 
         This is likely a bug in PyTorch: public APIs should not rely on
         manually setting up internal contexts. But we also rely on non-public
diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py
index 9e8de831bcb29..20918099f169d 100644
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -27,8 +27,8 @@ class CompilationCounter:
     num_cache_entries_updated: int = 0
     # The number of standalone_compile compiled artifacts saved
     num_compiled_artifacts_saved: int = 0
-    # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS
-    dynamo_as_is_count: int = 0
+    # Number of times a model was loaded with CompilationMode.STOCK_TORCH_COMPILE
+    stock_torch_compile_count: int = 0
 
     def clone(self) -> "CompilationCounter":
         return copy.deepcopy(self)
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index fe19d4e851294..20d4681e2c789 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -18,7 +18,7 @@ from torch._dynamo.symbolic_convert import InliningInstructionTranslator
 import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher
-from vllm.config import CompilationLevel, VllmConfig, set_current_vllm_config
+from vllm.config import CompilationMode, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.sequence import IntermediateTensors
 from vllm.utils import resolve_obj_by_qualname, supports_dynamo
@@ -233,11 +233,11 @@ def _support_torch_compile(
         old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
         self.vllm_config = vllm_config
         enable_compile = enable_if is None or enable_if(vllm_config)
-        # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner
+        # for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner
         # will handle the compilation, so we don't need to do anything here.
         self.do_not_compile = (
-            vllm_config.compilation_config.level
-            in [CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS]
+            vllm_config.compilation_config.mode
+            in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
             or not supports_dynamo()
             or _should_ignore_torch_compile(self.__class__)
             or not enable_compile
@@ -247,7 +247,7 @@ def _support_torch_compile(
 
         compilation_counter.num_models_seen += 1
         TorchCompileWrapperWithCustomDispatcher.__init__(
-            self, compilation_level=vllm_config.compilation_config.level
+            self, compilation_mode=vllm_config.compilation_config.mode
         )
 
     cls.__init__ = __init__
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index d3c437795fabb..1e6d0e79228b0 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -3,7 +3,7 @@
 
 import time
 
-from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
+from vllm.config import CompilationConfig, CompilationMode, VllmConfig
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -18,7 +18,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
 
     compilation_config: CompilationConfig = vllm_config.compilation_config
     path = vllm_config.compile_debug_dump_path()
-    if compilation_config.level == CompilationLevel.PIECEWISE and path:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE and path:
         import depyf
 
         path.mkdir(parents=True, exist_ok=True)
@@ -29,7 +29,7 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig):
 
 def end_monitoring_torch_compile(vllm_config: VllmConfig):
     compilation_config: CompilationConfig = vllm_config.compilation_config
-    if compilation_config.level == CompilationLevel.PIECEWISE:
+    if compilation_config.mode == CompilationMode.VLLM_COMPILE:
         logger.info(
             "torch.compile takes %.2f s in total", compilation_config.compilation_time
         )
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index b4a0d89af0d6d..4b10c85209f63 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -11,7 +11,7 @@ from types import CodeType
 import torch
 
 import vllm.envs as envs
-from vllm.config import CompilationLevel, CUDAGraphMode, get_current_vllm_config
+from vllm.config import CompilationMode, CUDAGraphMode, get_current_vllm_config
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher:
     """
 
     def __init__(
-        self, compiled_callable: Callable | None = None, compilation_level: int = 0
+        self, compiled_callable: Callable | None = None, compilation_mode: int = 0
     ):
         vllm_config = get_current_vllm_config()
         self.vllm_config = vllm_config
@@ -72,7 +72,7 @@ class TorchCompileWrapperWithCustomDispatcher:
         # subclasses can use this to switch between the custom dispatcher
         # and the default Dynamo guard mechanism.
         self.use_custom_dispatcher: bool = (
-            compilation_level >= CompilationLevel.DYNAMO_ONCE
+            compilation_mode >= CompilationMode.DYNAMO_TRACE_ONCE
         )
 
     def aot_compile(self, *args, **kwargs):
@@ -85,7 +85,7 @@ class TorchCompileWrapperWithCustomDispatcher:
         return self.compiled_callable.aot_compile((args, kwargs))
 
     def __call__(self, *args, **kwargs):
-        """Implement the dispatch logic here, beyond the torch.compile level.
+        """Implement the dispatch logic here, beyond the torch.compile layer.
         NOTE: this function can have additional arguments beyond the forward
          method, for directly dispatching to the compiled code.
         """
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 6a0197d044dcd..7f1cc52024205 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -4,7 +4,7 @@
 from vllm.config.cache import CacheConfig
 from vllm.config.compilation import (
     CompilationConfig,
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     PassConfig,
 )
@@ -49,7 +49,7 @@ __all__ = [
     "CacheConfig",
     # From vllm.config.compilation
     "CompilationConfig",
-    "CompilationLevel",
+    "CompilationMode",
     "CUDAGraphMode",
     "PassConfig",
     # From vllm.config.device
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index fb80835ba48a1..a34fb0bf920c0 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -26,12 +26,20 @@ else:
 logger = init_logger(__name__)
 
 
-class CompilationLevel:
-    # constants for the levels of the compilation process
-    NO_COMPILATION = 0
-    DYNAMO_AS_IS = 1
-    DYNAMO_ONCE = 2
-    PIECEWISE = 3
+class CompilationMode:
+    """The compilation approach used for torch.compile-based compilation of the
+    model."""
+
+    NONE = 0
+    """No torch.compile compilation is applied, model runs in fully eager pytorch mode.
+    The model runs as-is."""
+    STOCK_TORCH_COMPILE = 1
+    """The standard `torch.compile` compilation pipeline."""
+    DYNAMO_TRACE_ONCE = 2
+    """Single Dynamo trace through the model, avoiding recompilation."""
+    VLLM_COMPILE = 3
+    """Custom vLLM Inductor-based backend with caching, piecewise compilation,
+    shape specialization, and custom passes."""
 
 
 class CUDAGraphMode(enum.Enum):
@@ -134,7 +142,7 @@ class CompilationConfig:
     """Configuration for compilation. It has three parts:
 
     - Top-level Compilation control:
-        - [`level`][vllm.config.CompilationConfig.level]
+        - [`mode`][vllm.config.CompilationConfig.mode]
         - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
         - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
         - [`backend`][vllm.config.CompilationConfig.backend]
@@ -171,14 +179,26 @@ class CompilationConfig:
 
     # Top-level Compilation control
     level: int | None = None
-    """The level of compilation:
+    """
+    Level is deprecated and will be removed in the next release,
+    either 0.12.0 or 0.11.2 whichever is soonest.
+    Please use mode. Currently all levels are mapped to mode.
+    """
+    # Top-level Compilation control
+    mode: int | None = None
+    """The compilation approach used for torch.compile-based compilation of the
+    model.
 
-    - None: If None, we will select the default compilation level.
-      For V1 engine this is 3, for V0 engine this is 0.
-    - 0: no compilation.
-    - 1: dynamo as is.
-    - 2: dynamo once.
-    - 3: piecewise compilation."""
+    - None: If None, we will select the default compilation mode.
+      For V1 engine this is 3.
+    - 0: NONE: No torch.compile compilation is applied, model runs in fully
+         eager pytorch mode. The model runs as-is.
+    - 1: STOCK_TORCH_COMPILE: The standard `torch.compile` compilation pipeline.
+    - 2: DYNAMO_TRACE_ONCE: Single Dynamo trace through the model, avoiding
+         recompilation by removing guards.
+         Requires no dynamic-shape-dependent control-flow.
+    - 3: VLLM_COMPILE: Custom vLLM Inductor-based backend with caching,
+         piecewise compilation, shape specialization, and custom passes."""
     debug_dump_path: Path | None = None
     """The path to dump the debug information."""
     cache_dir: str = ""
@@ -195,11 +215,11 @@ class CompilationConfig:
 
     backend function.
     We use string to avoid serialization issues when using compilation in a
-    distributed setting. When the compilation level is 1 or 2, the backend is
+    distributed setting. When the compilation mode is 1 or 2, the backend is
     used for the compilation directly (it sees the whole graph). When the
-    compilation level is 3, the backend is used for the piecewise compilation
+    compilation mode is 3, the backend is used for the piecewise compilation
     (it sees a part of the graph). The backend can not be custom for compilation
-    level 3, i.e. the backend must be either eager or inductor. Furthermore,
+    mode 3, i.e. the backend must be either eager or inductor. Furthermore,
     compilation is only piecewise if splitting ops is set accordingly and
     use_inductor_graph_partition is off. Note that the default options for
     splitting ops are sufficient for piecewise compilation.
@@ -214,7 +234,7 @@ class CompilationConfig:
     - 'none,+op1,+op2' to enable only op1 and op2
 
     By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
+    disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
     Inductor generates (fused) Triton kernels for disabled custom ops."""
     splitting_ops: list[str] | None = None
     """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@@ -249,7 +269,7 @@ class CompilationConfig:
         One graph for symbolic shape and one graph per size in compile_sizes
         are compiled using configurations in inductor_compile_config.
 
-    This setting is ignored if level<PIECEWISE.
+    This setting is ignored if mode<VLLM_COMPILE.
 
     For future compatibility:
     If use_inductor is True, backend="inductor" otherwise backend="eager".
@@ -299,7 +319,7 @@ class CompilationConfig:
     Currently, the cudagraph mode is only used for the v1 engine.
     Note that the cudagraph logic is generally orthogonal to the 
     compilation logic. While piecewise cudagraphs require piecewise 
-    compilation (level=PIECEWISE and non-empty splitting_ops), full
+    compilation (mode=VLLM_COMPILE and non-empty splitting_ops), full
     cudagraphs are supported with and without compilation.
     
     Warning: This flag is new and subject to change in addition 
@@ -312,7 +332,7 @@ class CompilationConfig:
         that all input buffers have fixed addresses, and all
         splitting ops write their outputs to input buffers.
     In the vLLM V1 Engine, this flag only applies for
-    CompilationLevel.PIECEWISE (aka -O3).
+    CompilationMode.VLLM_COMPILE (aka -O3).
     Note that this is orthogonal to the cudagraph capture logic
     outside of compilation.
     Warning: This flag is deprecated and will be removed in the next major or
@@ -426,7 +446,7 @@ class CompilationConfig:
         the final hidden states.
         """
         factors: list[Any] = []
-        factors.append(self.level)
+        factors.append(self.mode)
         factors.append(self.backend)
         factors.append(self.custom_ops)
         factors.append(self.splitting_ops)
@@ -477,6 +497,17 @@ class CompilationConfig:
         return value
 
     def __post_init__(self) -> None:
+        if self.level is not None:
+            # `level` is the deprecated alias of `mode`; an explicit `mode` wins.
+            logger.warning(
+                "Level is deprecated and will be removed in the next release, "
+                "either 0.12.0 or 0.11.2 whichever is soonest. "
+                "Use mode instead. "
+                "If both level and mode are given, only mode will be used."
+            )
+            if self.mode is None:
+                self.mode = self.level
+
         count_none = self.custom_ops.count("none")
         count_all = self.custom_ops.count("all")
         assert count_none + count_all <= 1, "Can only specify 'none' or 'all'"
@@ -574,7 +605,7 @@ class CompilationConfig:
         # Currently only eager and inductor backend are supported.
         # for piecewise compilation. Custom backends are not suppported for
         # piecewise compilation. Update when more backends are supported.
-        if self.level == CompilationLevel.PIECEWISE and self.backend not in [
+        if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
             "",
             "eager",
             "inductor",
@@ -602,24 +633,27 @@ class CompilationConfig:
         Returns:
             The backend for the compilation config.
         """
-        if self.level is None:
+        if self.mode is None:
             raise ValueError(
-                "No compilation level is set. This method should only be \
+                "No compilation mode is set. This method should only be \
                 called via vllm config where the level is set if none is \
                 provided."
             )
-        if self.level == CompilationLevel.NO_COMPILATION:
-            raise ValueError("No compilation level is set.")
+        if self.mode == CompilationMode.NONE:
+            raise ValueError("No compilation mode is set.")
 
         from torch._dynamo.backends.registry import list_backends
 
         torch_backends = list_backends(exclude_tags=tuple())
-        if self.level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]:
+        if self.mode in [
+            CompilationMode.STOCK_TORCH_COMPILE,
+            CompilationMode.DYNAMO_TRACE_ONCE,
+        ]:
             if self.backend in torch_backends:
                 return self.backend
             return resolve_obj_by_qualname(self.backend)
 
-        assert self.level == CompilationLevel.PIECEWISE
+        assert self.mode == CompilationMode.VLLM_COMPILE
         if self.backend not in ["eager", "inductor"]:
             raise ValueError(
                 f"Invalid backend for piecewise compilation: {self.backend}"
@@ -684,11 +718,11 @@ class CompilationConfig:
         self.bs_to_padded_graph_size[self.max_capture_size] = self.max_capture_size
 
     def set_splitting_ops_for_v1(self):
-        # NOTE: this function needs to be called only when level is
-        # CompilationLevel.PIECEWISE
-        assert self.level == CompilationLevel.PIECEWISE, (
+        # NOTE: this function needs to be called only when mode is
+        # CompilationMode.VLLM_COMPILE
+        assert self.mode == CompilationMode.VLLM_COMPILE, (
             "set_splitting_ops_for_v1 should only be called when "
-            "level is CompilationLevel.PIECEWISE"
+            "mode is CompilationMode.VLLM_COMPILE"
         )
 
         if self.use_inductor_graph_partition:
@@ -769,12 +803,10 @@ class CompilationConfig:
 
         if not self.use_inductor_graph_partition:
             # Dynamo-level FX split case
-            return self.level == CompilationLevel.PIECEWISE
+            return self.mode == CompilationMode.VLLM_COMPILE
 
         # Inductor partition case
-        return (
-            self.backend == "inductor" and self.level > CompilationLevel.NO_COMPILATION
-        )
+        return self.backend == "inductor" and self.mode > CompilationMode.NONE
 
     def custom_op_log_check(self):
         """
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index b0ed12894065d..dabd06c320543 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -22,7 +22,7 @@ from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid
 
 from .cache import CacheConfig
-from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode
+from .compilation import CompilationConfig, CompilationMode, CUDAGraphMode
 from .device import DeviceConfig
 from .kv_events import KVEventsConfig
 from .kv_transfer import KVTransferConfig
@@ -84,17 +84,11 @@ class VllmConfig:
     compilation_config: CompilationConfig = Field(default_factory=CompilationConfig)
     """`torch.compile` and cudagraph capture configuration for the model.
 
-    As a shorthand, `-O<n>` can be used to directly specify the compilation
-    level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`).
-    Currently, -O <n> and -O=<n> are supported as well but this will likely be
-    removed in favor of clearer -O<n> syntax in the future.
-
-    NOTE: level 0 is the default level without any optimization. level 1 and 2
-    are for internal testing only. level 3 is the recommended level for
-    production, also default in V1.
+    As a shorthand, one can append compilation arguments via
+    -O.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`).
 
     You can specify the full compilation config like so:
-    `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
+    `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
     """
     kv_transfer_config: KVTransferConfig | None = None
     """The configurations for distributed KV cache transfer."""
@@ -305,33 +299,33 @@ class VllmConfig:
                 "precision for chunked prefill triton kernels."
             )
 
-        # If the user does not explicitly set a compilation level, then
-        # we use the default level. The default level depends on other
+        # If the user does not explicitly set a compilation mode, then
+        # we use the default mode. The default mode depends on other
         # settings (see the below code).
-        if self.compilation_config.level is None:
+        if self.compilation_config.mode is None:
             if envs.VLLM_USE_V1:
                 if (
                     self.model_config is not None
                     and not self.model_config.enforce_eager
                 ):
-                    self.compilation_config.level = CompilationLevel.PIECEWISE
+                    self.compilation_config.mode = CompilationMode.VLLM_COMPILE
                 else:
-                    self.compilation_config.level = CompilationLevel.NO_COMPILATION
+                    self.compilation_config.mode = CompilationMode.NONE
 
             else:
-                # NB: Passing both --enforce-eager and a compilation level
-                # in V0 means the compilation level wins out.
-                self.compilation_config.level = CompilationLevel.NO_COMPILATION
+                # NB: Passing both --enforce-eager and a compilation mode
+                # in V0 means the compilation mode wins out.
+                self.compilation_config.mode = CompilationMode.NONE
         else:
-            assert self.compilation_config.level >= CompilationLevel.NO_COMPILATION
-            assert self.compilation_config.level <= CompilationLevel.PIECEWISE
+            assert self.compilation_config.mode >= CompilationMode.NONE
+            assert self.compilation_config.mode <= CompilationMode.VLLM_COMPILE
 
         # If user does not set custom ops via none or all set it here based on
-        # compilation level and backend.
+        # compilation mode and backend.
         if all(s not in self.compilation_config.custom_ops for s in ("all", "none")):
             if (
                 self.compilation_config.backend == "inductor"
-                and self.compilation_config.level > CompilationLevel.NO_COMPILATION
+                and self.compilation_config.mode > CompilationMode.NONE
             ):
                 self.compilation_config.custom_ops.append("none")
             else:
@@ -350,7 +344,7 @@ class VllmConfig:
             if self.compilation_config.cudagraph_mode is None:
                 if (
                     envs.VLLM_USE_V1
-                    and self.compilation_config.level == CompilationLevel.PIECEWISE
+                    and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
                 ):
                     # default to full and piecewise for most models
                     self.compilation_config.cudagraph_mode = (
@@ -486,10 +480,10 @@ class VllmConfig:
             )
         current_platform.check_and_update_config(self)
 
-        # Do this after all the updates to compilation_config.level
+        # Do this after all the updates to compilation_config.mode
         if (
             envs.VLLM_USE_V1
-            and self.compilation_config.level == CompilationLevel.PIECEWISE
+            and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
         ):
             self.compilation_config.set_splitting_ops_for_v1()
 
@@ -508,8 +502,8 @@ class VllmConfig:
                 )
 
             if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
-                assert self.compilation_config.level == CompilationLevel.PIECEWISE, (
-                    "Compilation level should be CompilationLevel.PIECEWISE "
+                assert self.compilation_config.mode == CompilationMode.VLLM_COMPILE, (
+                    "Compilation mode should be CompilationMode.VLLM_COMPILE "
                     "when cudagraph_mode piecewise cudagraphs is used, "
                     f"cudagraph_mode={self.compilation_config.cudagraph_mode}"
                 )
@@ -837,7 +831,7 @@ def set_current_vllm_config(
 
         if (
             check_compile
-            and vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
+            and vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE
             and compilation_counter.num_models_seen == num_models_seen
         ):
             # If the model supports compilation,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 668344fdcc34c..61376736d0f7a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -176,7 +176,7 @@ class LLM:
             argument is deprecated and will be removed in v0.12.0 or v1.0.0,
             whichever is sooner.
         compilation_config: Either an integer or a dictionary. If it is an
-            integer, it is used as the level of compilation optimization. If it
+            integer, it is used as the mode of compilation optimization. If it
             is a dictionary, it can specify the full compilation configuration.
         **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
 
@@ -257,9 +257,7 @@ class LLM:
 
         if compilation_config is not None:
             if isinstance(compilation_config, int):
-                compilation_config_instance = CompilationConfig(
-                    level=compilation_config
-                )
+                compilation_config_instance = CompilationConfig(mode=compilation_config)
             elif isinstance(compilation_config, dict):
                 compilation_config_instance = CompilationConfig(
                     **{
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 44feb24a1eefc..4fda4d76a9808 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -8,7 +8,7 @@ from packaging import version
 
 from vllm import _custom_ops as ops
 from vllm import envs
-from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.config import CompilationMode, get_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.platforms import current_platform
@@ -419,7 +419,7 @@ class Fp8LinearOp:
         if pad_output is None:
             config = get_current_vllm_config().compilation_config
             pad_output = (
-                config.level < CompilationLevel.PIECEWISE
+                config.mode < CompilationMode.VLLM_COMPILE
                 and self.preferred_backend == "torch"
             )
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 17d610ac16a39..1a34e9150ce73 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -247,12 +247,12 @@ class CpuPlatform(Platform):
             parallel_config.enable_dbo = False
 
         # Note: workaround for v1 gpu_model_runner
-        from vllm.config import CompilationLevel
+        from vllm.config import CompilationMode
 
         vllm_config.compilation_config.cudagraph_capture_sizes = []
 
         compilation_config = vllm_config.compilation_config
-        if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE:
+        if vllm_config.compilation_config.mode == CompilationMode.VLLM_COMPILE:
             # Note: vLLM V1 is using PIECEWISE level compilation, which will
             # take time to compile kernels just-in-time with the inductor
             # backend. For CPU CI tests, most of them are executed fast and
@@ -265,7 +265,7 @@ class CpuPlatform(Platform):
             else:
                 backend = "inductor"
 
-            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
             compilation_config.backend = backend
             compilation_config.inductor_compile_config.update(
                 {
@@ -277,7 +277,7 @@ class CpuPlatform(Platform):
             )
 
         if vllm_config.lora_config is not None:
-            compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.mode = CompilationMode.NONE
 
         assert vllm_config.device_config.device_type == "cpu"
 
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index dcd595cf9082f..ed38f3bc30878 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -114,7 +114,7 @@ class TpuPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        from vllm.config import CompilationLevel, CUDAGraphMode
+        from vllm.config import CompilationMode, CUDAGraphMode
 
         cache_config = vllm_config.cache_config
         # For v0, the default block size is 16.
@@ -122,12 +122,13 @@ class TpuPlatform(Platform):
             cache_config.block_size = cast(BlockSize, 16)
         compilation_config = vllm_config.compilation_config
 
-        # TPU only supports DYNAMO_ONCE compilation level
-        if compilation_config.level != CompilationLevel.DYNAMO_ONCE:
+        # TPU only supports DYNAMO_TRACE_ONCE compilation mode
+        if compilation_config.mode != CompilationMode.DYNAMO_TRACE_ONCE:
             logger.info(
-                "[TPU] Forcing DYNAMO_ONCE compilation level, and disabling cudagraph."
+                "[TPU] Forcing DYNAMO_TRACE_ONCE compilation mode, and "
+                "disabling cudagraph."
             )
-            compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            compilation_config.mode = CompilationMode.DYNAMO_TRACE_ONCE
 
         if (
             compilation_config.cudagraph_mode is None
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index dcfc970d3a83d..4638e9fa30216 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -144,7 +144,7 @@ class XPUPlatform(Platform):
             cache_config.block_size = 64
 
         # lazy import to avoid circular import
-        from vllm.config import CompilationLevel, CUDAGraphMode
+        from vllm.config import CompilationMode, CUDAGraphMode
 
         compilation_config = vllm_config.compilation_config
         if compilation_config.compile_sizes is None:
@@ -155,7 +155,7 @@ class XPUPlatform(Platform):
         )
 
         if vllm_config.lora_config is not None:
-            compilation_config.level = CompilationLevel.NO_COMPILATION
+            compilation_config.mode = CompilationMode.NONE
 
         # check and update parallel config
         parallel_config = vllm_config.parallel_config
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index c8da83047a406..bb5d3a688094f 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1686,16 +1686,16 @@ class FlexibleArgumentParser(ArgumentParser):
             elif arg.startswith("-O") and arg != "-O" and arg[2] != ".":
                 # allow -O flag to be used without space, e.g. -O3 or -Odecode
                 # -O.<...> handled later
-                # also handle -O=<level> here
-                level = arg[3:] if arg[2] == "=" else arg[2:]
-                processed_args.append(f"-O.level={level}")
+                # also handle -O=<mode> here
+                mode = arg[3:] if arg[2] == "=" else arg[2:]
+                processed_args.append(f"-O.mode={mode}")
             elif (
                 arg == "-O"
                 and i + 1 < len(args)
                 and args[i + 1] in {"0", "1", "2", "3"}
             ):
-                # Convert -O <n> to -O.level <n>
-                processed_args.append("-O.level")
+                # Convert -O <n> to -O.mode <n>
+                processed_args.append("-O.mode")
             else:
                 processed_args.append(arg)
 
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 9f071a0ddac22..a12704b664c3d 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -43,12 +43,12 @@ class CudagraphDispatcher:
             not_use_piecewise_compilation
             or self.compilation_config.is_attention_compiled_piecewise()
         ), (
-            "Compilation level should be CompilationLevel.PIECEWISE when "
+            "Compilation mode should be CompilationMode.VLLM_COMPILE when "
             "cudagraph_mode piecewise cudagraphs is used, "
             "and attention should be in splitting_ops or "
             "inductor splitting should be used. "
             f"cudagraph_mode={self.cudagraph_mode}, "
-            f"compilation_level={self.compilation_config.level}, "
+            f"compilation_mode={self.compilation_config.mode}, "
             f"splitting_ops={self.compilation_config.splitting_ops}"
         )
 
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index ad504da55fd8c..6d5d0b2614fa7 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -9,7 +9,7 @@ import torch
 import torch.nn as nn
 
 from vllm.config import (
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     get_layers_from_vllm_config,
@@ -86,7 +86,7 @@ class EagleProposer:
         self.use_cuda_graph = False
 
         compilation_config = self.vllm_config.compilation_config
-        if compilation_config.level == CompilationLevel.PIECEWISE:
+        if compilation_config.mode == CompilationMode.VLLM_COMPILE:
             cudagraph_mode = compilation_config.cudagraph_mode
             if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode(
                 CUDAGraphMode.PIECEWISE
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 72f8824e20054..d995a609318cd 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -25,7 +25,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphWrapper
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import (
-    CompilationLevel,
+    CompilationMode,
     CUDAGraphMode,
     VllmConfig,
     get_layers_from_vllm_config,
@@ -2927,14 +2927,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             )
 
         if (
-            self.vllm_config.compilation_config.level == CompilationLevel.DYNAMO_AS_IS
+            self.vllm_config.compilation_config.mode
+            == CompilationMode.STOCK_TORCH_COMPILE
             and supports_dynamo()
         ):
             backend = self.vllm_config.compilation_config.init_backend(self.vllm_config)
-            compilation_counter.dynamo_as_is_count += 1
+            compilation_counter.stock_torch_compile_count += 1
             self.model.compile(fullgraph=True, backend=backend)
             return
-        # for other compilation levels, cudagraph behavior is controlled by
+        # for other compilation modes, cudagraph behavior is controlled by
         # CudagraphWraper and CudagraphDispatcher of vllm.
 
         # wrap the model with full cudagraph wrapper if needed.
@@ -3985,7 +3986,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # if not supported any full cudagraphs, just raise it.
                 msg += (
                     "; please try cudagraph_mode=PIECEWISE, and "
-                    "make sure compilation level is piecewise"
+                    "make sure compilation mode is VLLM_COMPILE"
                 )
                 raise ValueError(msg)
 
@@ -4012,7 +4013,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 f"with {min_cg_builder_name} backend (support: "
                 f"{min_cg_support})"
             )
-            if self.compilation_config.level == CompilationLevel.PIECEWISE and (
+            if self.compilation_config.mode == CompilationMode.VLLM_COMPILE and (
                 self.compilation_config.splitting_ops_contain_attention()
                 or self.compilation_config.use_inductor_graph_partition
             ):
@@ -4068,7 +4069,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 f"supported with {min_cg_builder_name} backend ("
                 f"support:{min_cg_support}) "
                 "; please try cudagraph_mode=PIECEWISE, "
-                "and make sure compilation level is piecewise"
+                "and make sure compilation mode is VLLM_COMPILE"
             )
 
         # Trigger cudagraph dispatching keys initialization here (after

From a2986b3e337cb83676700fd4e76a6548e96e874b Mon Sep 17 00:00:00 2001
From: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com>
Date: Tue, 14 Oct 2025 19:54:43 -0700
Subject: [PATCH 12/51] [Bugfix] Fixes prefix-repetition benchmark script
 (#26828)

Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
---
 vllm/benchmarks/datasets.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index d610389ddb6b0..20a15bbc31e38 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -2979,13 +2979,14 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
         requests = []
         token_mismatch_total = 0
         for _ in range(num_prefixes):
-            prefix_tokens = _generate_exact_length_tokens(prefix_len)
+            prefix_tokens, prefix_mismatch = _generate_exact_length_tokens(prefix_len)
+            token_mismatch_total += prefix_mismatch
 
             for _ in range(prompts_per_prefix):
-                suffix_tokens, token_mistmatch = _generate_exact_length_tokens(
+                suffix_tokens, suffix_mismatch = _generate_exact_length_tokens(
                     suffix_len
                 )
-                token_mismatch_total += token_mistmatch
+                token_mismatch_total += suffix_mismatch
                 combined_tokens = prefix_tokens + suffix_tokens
                 prompt = tokenizer.decode(combined_tokens)
                 prompt_len = len(combined_tokens)

From 85a65e7f51ad6901979ff43c95deb6ac727a9430 Mon Sep 17 00:00:00 2001
From: Tao Hui <taohui3@gmail.com>
Date: Wed, 15 Oct 2025 11:09:52 +0800
Subject: [PATCH 13/51] [Model] Add DeepSeek-V3.1 reasoning parser (split from
 PR #24972) (#25589)

Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
---
 docs/features/reasoning_outputs.md            |  4 +-
 .../test_deepseekv3_reasoning_parser.py       | 76 +++++++++++++++++++
 vllm/entrypoints/openai/serving_chat.py       | 10 ++-
 vllm/reasoning/__init__.py                    |  4 +
 .../reasoning/deepseek_v3_reasoning_parser.py | 66 ++++++++++++++++
 vllm/reasoning/identity_reasoning_parser.py   | 58 ++++++++++++++
 6 files changed, 215 insertions(+), 3 deletions(-)
 create mode 100644 tests/reasoning/test_deepseekv3_reasoning_parser.py
 create mode 100644 vllm/reasoning/deepseek_v3_reasoning_parser.py
 create mode 100644 vllm/reasoning/identity_reasoning_parser.py

diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index ab04a1efcc083..0b00b8805bb2c 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -11,6 +11,7 @@ vLLM currently supports the following reasoning models:
 | Model Series | Parser Name | Structured Output Support | Tool Calling |
 |--------------|-------------|------------------|-------------|
 | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
+| [DeepSeek-V3.1](https://huggingface.co/collections/deepseek-ai/deepseek-v31-68a491bed32bd77e7fca048f) | `deepseek_v3` | `json`, `regex` | ❌ |
 | [ERNIE-4.5-VL series](https://huggingface.co/baidu/ERNIE-4.5-VL-28B-A3B-PT) | `ernie45` | `json`, `regex` | ❌ |
 | [ERNIE-4.5-21B-A3B-Thinking](https://huggingface.co/baidu/ERNIE-4.5-21B-A3B-Thinking) | `ernie45` | `json`, `regex` | ✅ |
 | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
@@ -20,8 +21,9 @@ vLLM currently supports the following reasoning models:
 | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
 
 !!! note
-    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
+    IBM Granite 3.2 and DeepSeek-V3.1 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
     The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
+    DeepSeek-V3.1 tool calling is supported in non-thinking mode.
 
 ## Quickstart
 
diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py
new file mode 100644
index 0000000000000..3d12f3e5b30e8
--- /dev/null
+++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.reasoning import (
+    DeepSeekR1ReasoningParser,
+    DeepSeekV3ReasoningParser,
+    IdentityReasoningParser,
+)
+
+REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-V3.1"
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+@pytest.mark.parametrize(
+    "thinking,expected_parser_type",
+    [
+        (True, DeepSeekR1ReasoningParser),
+        (False, IdentityReasoningParser),
+    ],
+)
+def test_parser_selection(tokenizer, thinking, expected_parser_type):
+    parser = DeepSeekV3ReasoningParser(
+        tokenizer, chat_template_kwargs={"thinking": thinking}
+    )
+
+    assert isinstance(parser._parser, expected_parser_type)
+
+
+def test_identity_reasoning_parser_basic(tokenizer):
+    parser = IdentityReasoningParser(tokenizer)
+
+    # Test is_reasoning_end always returns True
+    input_text = "This is some output"
+    input_tokens = tokenizer.tokenize(input_text)
+    input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
+    assert parser.is_reasoning_end(input_ids) is True
+
+    # Test extract_content_ids returns all input_ids
+    assert parser.extract_content_ids(input_ids) == input_ids
+
+    # Test extract_reasoning_content returns (None, model_output)
+    request = ChatCompletionRequest(model="test-model", messages=[], temperature=1.0)
+    reasoning, content = parser.extract_reasoning_content(input_text, request)
+    assert reasoning is None
+    assert content == input_text
+
+    # Test extract_reasoning_content_streaming returns DeltaMessage or None
+    result = parser.extract_reasoning_content_streaming(
+        previous_text="",
+        current_text="Hello world",
+        delta_text="Hello world",
+        previous_token_ids=[],
+        current_token_ids=input_ids,
+        delta_token_ids=input_ids,
+    )
+    assert isinstance(result, DeltaMessage)
+    assert result.content == "Hello world"
+
+    # If delta_text is empty, should return None
+    result_none = parser.extract_reasoning_content_streaming(
+        previous_text="Hello world",
+        current_text="Hello world",
+        delta_text="",
+        previous_token_ids=input_ids,
+        current_token_ids=input_ids,
+        delta_token_ids=[],
+    )
+    assert result_none is None
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 26027112eb589..5dc7f7859226d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -570,7 +570,10 @@ class OpenAIServingChat(OpenAIServing):
 
         try:
             if self.reasoning_parser:
-                reasoning_parser = self.reasoning_parser(tokenizer)
+                reasoning_parser = self.reasoning_parser(
+                    tokenizer,
+                    chat_template_kwargs=request.chat_template_kwargs,  # type: ignore
+                )
         except RuntimeError as e:
             logger.exception("Error in reasoning parser creation.")
             data = self.create_streaming_error_response(str(e))
@@ -1335,7 +1338,10 @@ class OpenAIServingChat(OpenAIServing):
 
             if self.reasoning_parser:
                 try:
-                    reasoning_parser = self.reasoning_parser(tokenizer)
+                    reasoning_parser = self.reasoning_parser(
+                        tokenizer,
+                        chat_template_kwargs=request.chat_template_kwargs,  # type: ignore
+                    )
                 except RuntimeError as e:
                     logger.exception("Error in reasoning parser creation.")
                     return self.create_error_response(str(e))
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 10c990f361324..ecee1af439028 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -4,11 +4,13 @@
 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 from .basic_parsers import BaseThinkingReasoningParser
 from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from .deepseek_v3_reasoning_parser import DeepSeekV3ReasoningParser
 from .ernie45_reasoning_parser import Ernie45ReasoningParser
 from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
 from .gptoss_reasoning_parser import GptOssReasoningParser
 from .granite_reasoning_parser import GraniteReasoningParser
 from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
+from .identity_reasoning_parser import IdentityReasoningParser
 from .mistral_reasoning_parser import MistralReasoningParser
 from .olmo3_reasoning_parser import Olmo3ReasoningParser
 from .qwen3_reasoning_parser import Qwen3ReasoningParser
@@ -20,6 +22,8 @@ __all__ = [
     "BaseThinkingReasoningParser",
     "ReasoningParserManager",
     "DeepSeekR1ReasoningParser",
+    "IdentityReasoningParser",
+    "DeepSeekV3ReasoningParser",
     "Ernie45ReasoningParser",
     "GraniteReasoningParser",
     "HunyuanA13BReasoningParser",
diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py
new file mode 100644
index 0000000000000..7116f90a1ac0a
--- /dev/null
+++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import (
+    DeepSeekR1ReasoningParser,
+    ReasoningParser,
+    ReasoningParserManager,
+)
+
+from .identity_reasoning_parser import IdentityReasoningParser
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("deepseek_v3")
+class DeepSeekV3ReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for DeepSeek-V3.1: wraps DeepSeekR1ReasoningParser when
+    `chat_template_kwargs["thinking"]` is truthy, else IdentityReasoningParser.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+        # Use `.get`, not `.pop`: this dict is the caller's (request's) own object.
+        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}
+        thinking = bool(chat_kwargs.get("thinking", False))
+
+        if thinking:
+            self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs)
+        else:
+            self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs)
+
+    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
+        return self._parser.is_reasoning_end(input_ids)
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        return self._parser.extract_content_ids(input_ids)
+
+    def extract_reasoning_content(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        return self._parser.extract_reasoning_content(model_output, request)
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        return self._parser.extract_reasoning_content_streaming(
+            previous_text,
+            current_text,
+            delta_text,
+            previous_token_ids,
+            current_token_ids,
+            delta_token_ids,
+        )
diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py
new file mode 100644
index 0000000000000..f1d17a71be338
--- /dev/null
+++ b/vllm/reasoning/identity_reasoning_parser.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser
+
+logger = init_logger(__name__)
+
+
+class IdentityReasoningParser(ReasoningParser):
+    """
+    Identity reasoning parser.
+
+    This parser does not attempt to parse or strip out reasoning tokens.
+    It treats the entire model output as content and ignores reasoning.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
+        super().__init__(tokenizer, *args, **kwargs)
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction."
+            )
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        # Always return True, since we never treat reasoning specially
+        return True
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        # Identity: return all tokens as content
+        return input_ids
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> DeltaMessage | None:
+        # Just wrap delta_text as content, ignore reasoning
+        if delta_text:
+            return DeltaMessage(content=delta_text)
+        return None
+
+    def extract_reasoning_content(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[str | None, str | None]:
+        # No reasoning separation: return None for reasoning_content,
+        # and full model_output as content
+        return None, model_output

From c43ca8259effad1735f0cf8821204247b6ac70ea Mon Sep 17 00:00:00 2001
From: Michael Yao <haifeng.yao@daocloud.io>
Date: Wed, 15 Oct 2025 11:35:08 +0800
Subject: [PATCH 14/51] [Docs] Move build.inc into arm.inc (#26862)

Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
---
 .../installation/cpu/arm.inc.md               | 41 ++++++++++++++++-
 .../installation/cpu/build.inc.md             | 44 -------------------
 2 files changed, 40 insertions(+), 45 deletions(-)
 delete mode 100644 docs/getting_started/installation/cpu/build.inc.md

diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md
index 15fce69b44871..9cae9ed1a212e 100644
--- a/docs/getting_started/installation/cpu/arm.inc.md
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@@ -23,7 +23,46 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
---8<-- "docs/getting_started/installation/cpu/build.inc.md:extra-information"
+First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```bash
+sudo apt-get update  -y
+sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Second, clone the vLLM project:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git vllm_source
+cd vllm_source
+```
+
+Third, install required dependencies:
+
+```bash
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
+```
+
+??? console "pip"
+    ```bash
+    pip install --upgrade pip
+    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+
+Finally, build and install vLLM:
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
+```
+
+If you want to develop vLLM, install it in editable mode instead.
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
+```
 
 Testing has been conducted on AWS Graviton3 instances for compatibility.
 
diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
deleted file mode 100644
index f99497128fd37..0000000000000
--- a/docs/getting_started/installation/cpu/build.inc.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# --8<-- [start:extra-information]
-
-First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
-
-```bash
-sudo apt-get update  -y
-sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certificates gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-```
-
-Second, clone the vLLM project:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git vllm_source
-cd vllm_source
-```
-
-Third, install required dependencies:
-
-```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend cpu
-uv pip install -r requirements/cpu.txt --torch-backend cpu
-```
-
-??? console "pip"
-    ```bash
-    pip install --upgrade pip
-    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    ```
-
-Finally, build and install vLLM:
-
-```bash
-VLLM_TARGET_DEVICE=cpu python setup.py install
-```
-
-If you want to develop vLLM, install it in editable mode instead.
-
-```bash
-VLLM_TARGET_DEVICE=cpu python setup.py develop
-```
-
-# --8<-- [end:extra-information]

From e471d7ca7ee8035a3595297324d7d83da4f79630 Mon Sep 17 00:00:00 2001
From: zhrrr <43847754+izhuhaoran@users.noreply.github.com>
Date: Wed, 15 Oct 2025 12:09:44 +0800
Subject: [PATCH 15/51] [CI/Build][Bugfix] fix qutlass cmake error when set
 QUTLASS_SRC_DIR (#26773)

Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 cmake/external_projects/qutlass.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake
index 9aace7693077a..5a59a409999ad 100644
--- a/cmake/external_projects/qutlass.cmake
+++ b/cmake/external_projects/qutlass.cmake
@@ -22,10 +22,10 @@ else()
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
   )
-  FetchContent_Populate(qutlass)
-  set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
 endif()
 
+FetchContent_Populate(qutlass)
+
 if(NOT qutlass_SOURCE_DIR)
   message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
 endif()

From a27b288e4a389e3ece5e178bc0219c6c0e1db7d1 Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Wed, 15 Oct 2025 12:23:44 +0800
Subject: [PATCH 16/51] [Feature] default --extra-body param to disable
 thinking in vllm bench serve (#26784)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
---
 vllm/benchmarks/serve.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index c52e384a40023..3c85a1e8fdd9e 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -1230,6 +1230,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "the ready check will be skipped.",
     )
 
+    parser.add_argument(
+        "--extra-body",
+        help="A JSON string representing extra body parameters to include "
+        "in each request. "
+        'Example: \'{"chat_template_kwargs":{"enable_thinking":false}}\'',
+        type=json.loads,
+        default=None,
+    )
+
 
 def main(args: argparse.Namespace) -> dict[str, Any]:
     return asyncio.run(main_async(args))
@@ -1330,6 +1339,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     else:
         sampling_params = {}
 
+    extra_body = args.extra_body or {}
+    extra_body = {**sampling_params, **extra_body}
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
@@ -1355,7 +1367,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         max_concurrency=args.max_concurrency,
         lora_modules=args.lora_modules,
         extra_headers=headers,
-        extra_body=sampling_params,
+        extra_body=extra_body,
         ramp_up_strategy=args.ramp_up_strategy,
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,

From 7cfa420f4927a9e3fa4f533b2169d51597329a96 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Tue, 14 Oct 2025 22:04:32 -0700
Subject: [PATCH 17/51] [BugFix] Patch inductor partitioning logic (#26735)

Signed-off-by: angelayi <yiangela7@gmail.com>
---
 vllm/env_override.py | 118 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index 7f9054e738463..eb51dee1cf033 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -3,6 +3,7 @@
 import os
 
 import torch
+from packaging import version
 
 from vllm.logger import init_logger
 
@@ -21,3 +22,120 @@ os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1"
 os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 # see https://github.com/vllm-project/vllm/issues/10619
 torch._inductor.config.compile_threads = 1
+
+
+# ========================================
+# torch 2.9 Inductor Scheduler monkeypatch
+# ========================================
+# This change monkeypatches a function in Inductor to work around the following
+# bug: https://github.com/vllm-project/vllm/issues/26678
+#
+# The bug occurs when `use_inductor_graph_partition` is turned on and there
+# exists operators inside of `splitting_ops` that have an in-place mutation. In
+# vllm, this specifically occurs on the operator
+# vllm.unified_attention_with_output. In this case, inductor does not populate
+# the inductor IR's `origin_node` field, causing an assertion error when trying
+# to access the node's `origin_node` field.
+#
+# So, we will monkeypatch torch._inductor.scheduler.Scheduler.should_partition
+# so that it does not access the inductor IR node's `origin_node` field and just
+# returns True if a node is registered as having a custom partition function.
+# This is ok for now since vllm's implementation of the custom partition
+# functions just return True.
+# ========================================
+
+
+def should_partition_patched(self, node, should_log: bool = False) -> bool:
+    # Patched copy of torch._inductor.scheduler.Scheduler.should_partition: a
+    # FallbackKernel whose op is in _custom_should_partition_fns returns True
+    # right away instead of running the upstream logic linked below:
+    # https://github.com/pytorch/pytorch/blob/ecb53078faf86ca1b33277df33b82985675bb011/torch/_inductor/scheduler.py#L4712-L4724
+    """Return True if we should partition the inductor graph on this node"""
+
+    import torch._inductor.ir as ir
+    from torch._inductor.scheduler import (
+        BaseSchedulerNode,
+        FusedSchedulerNode,
+        _custom_should_partition_fns,
+    )
+    from torch._inductor.utils import (
+        _unstable_customized_partition_wrapper,
+        is_cudagraph_unsafe_op,
+        maybe_log_cudagraph_partition,
+    )
+
+    # Ops with a registered custom partition function always partition; the
+    # function itself is not called. Only FallbackKernels are considered.
+    ir_node = node.node
+    if isinstance(ir_node, ir.FallbackKernel):
+        operator = ir_node.op_overload
+        if operator is not None and operator in _custom_should_partition_fns:
+            return True
+
+    # When not using cudagraphs, keep all kernels in the `call` function
+    # instead of graph partition functions, since graph partition only brings
+    # benefit to cudagraph
+    if (
+        not torch._inductor.config.triton.cudagraphs
+        and _unstable_customized_partition_wrapper.wrapper is None
+    ):
+        return True
+
+    # avoid duplicating logs when should_partition is called multiple times
+    # on the same node
+    def noop_log(msg: str, node: BaseSchedulerNode | None) -> None:
+        return
+
+    log_partition_reason = maybe_log_cudagraph_partition if should_log else noop_log
+
+    if isinstance(node, FusedSchedulerNode):
+        return any(self.should_partition(snode) for snode in node.snodes)
+
+    assert node.node is not None
+
+    if not node.is_gpu():
+        log_partition_reason("non gpu ops", node=node)
+
+        return True
+
+    if isinstance(node.node, ir.DeviceCopy):
+        log_partition_reason("DeviceCopy ops", node=node)
+        return True
+
+    if isinstance(node.node, ir.Conditional):
+        log_partition_reason("Conditional ops", node=node)
+        return True
+
+    if getattr(node.node, "unbacked_bindings", None):
+        log_partition_reason("unbacked binding ops", node=node)
+        return True
+
+    if is_cudagraph_unsafe_op(node.node):
+        log_partition_reason("CUDAGraph-unsafe custom ops", node=node)
+        return True
+
+    return False
+
+
+def _update_scheduler_patched(self) -> None:
+    # Copied from torch._inductor.graph.GraphLowering._update_scheduler, plus one
+    # extra step: Scheduler.should_partition is replaced (class-wide) with
+    # should_partition_patched before the Scheduler is constructed.
+    """
+    (Re)initializes the scheduler member.  When initializing the scheduler, no CUBIN
+    files should be generated (to avoid biasing any benchmarks and pessimizing
+    fusion decisions).
+    """
+    import torch._inductor.config as config
+    from torch._inductor.scheduler import Scheduler
+
+    Scheduler.should_partition = should_partition_patched
+
+    with config.patch("triton.store_cubin", False):
+        self.scheduler = Scheduler(self.operations)
+
+
+if version.parse(str(torch.__version__)).public == "2.9.0":  # ignore "+cu128" etc.
+    from torch._inductor.graph import GraphLowering
+
+    GraphLowering._update_scheduler = _update_scheduler_patched

From 8c851f6d044bf7922122a1735e57aea727e30d45 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Wed, 15 Oct 2025 13:38:36 +0800
Subject: [PATCH 18/51] [Bugfix] Fix qwen3-omni audio truncation issue (#26815)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 .../models/qwen3_omni_moe_thinker.py           | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index d565a0108432a..d5a75e75aa43e 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -30,7 +30,9 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from packaging.version import Version
 from transformers import PretrainedConfig
+from transformers import __version__ as TRANSFORMERS_VERSION
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import (
     Qwen3OmniMoeConfig,
@@ -711,11 +713,12 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             return x
 
         # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
+        feature_extractor = self.info.get_feature_extractor()
+        hop_length = feature_extractor.hop_length
         if audios:
             # NOTE: Qwen3-Omni processor accept "audio"
             # To make sure the cache works with padding=True, we pre-padded
             # the audio to multiple of hop_length.
-            hop_length = self.info.get_feature_extractor().hop_length
             mm_data["audio"] = [
                 pad_to_hop_length(audio, hop_length)
                 if isinstance(audio, np.ndarray)
@@ -725,6 +728,14 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             mm_kwargs = dict(
                 **mm_kwargs,
             )
+            # TODO(Isotr0py): Remove this patch after upstream fix PR
+            # released and Transformers version update:
+            # https://github.com/huggingface/transformers/pull/41473
+            if (
+                Version(TRANSFORMERS_VERSION) < Version("4.58.0")
+                and "truncation" not in mm_kwargs
+            ):
+                mm_kwargs["truncation"] = False
 
         hf_inputs = super()._call_hf_processor(
             prompt=prompt,
@@ -738,7 +749,6 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             and "feature_attention_mask" in hf_inputs
             and (audios := mm_data.get("audio", []))
         ):
-            hop_length = self.info.get_feature_extractor().hop_length
             audio_num_frames = []
             for _, audio in enumerate(audios):
                 audio_length = len(audio[0]) if isinstance(audio, tuple) else len(audio)
@@ -747,6 +757,10 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
                     if audio_length % hop_length == 0
                     else (audio_length // hop_length - 1)
                 )
+                if mm_kwargs.get("truncation", False):
+                    num_frame = min(
+                        num_frame, feature_extractor.n_samples // hop_length
+                    )
                 audio_num_frames.append(num_frame)
             hf_inputs["feature_attention_mask"] = [
                 torch.ones(num_frame) for num_frame in audio_num_frames

From f0862eae43b9219f000a96e34ee00617600dac57 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Tue, 14 Oct 2025 23:39:48 -0700
Subject: [PATCH 19/51] [Graph Partition] pass tests for decorator (#26831)

Signed-off-by: Boyuan Feng <boyuan@meta.com>
---
 tests/compile/test_decorator.py | 100 +++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 26 deletions(-)

diff --git a/tests/compile/test_decorator.py b/tests/compile/test_decorator.py
index 4d60899a628a9..e459bc539f2b8 100644
--- a/tests/compile/test_decorator.py
+++ b/tests/compile/test_decorator.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
 import torch
 from torch import nn
 
@@ -14,6 +15,7 @@ from vllm.config import (
     set_current_vllm_config,
 )
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils import is_torch_equal_or_newer
 
 # This import automatically registers `torch.ops.silly.attention`
 from . import silly_attention  # noqa: F401
@@ -65,19 +67,40 @@ def run_model(
         return output.cpu()
 
 
-def test_ignore_torch_compile_decorator():
-    # vllmcompile
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+def test_ignore_torch_compile_decorator(use_inductor_graph_partition, monkeypatch):
+    # disable compile cache so that we can count the number of compilations
+    # appropriately
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
+    # piecewise
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
-            use_inductor_graph_partition=False,  # TODO test both?
+            use_inductor_graph_partition=use_inductor_graph_partition,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
 
+    expected_num_graphs_seen = 1
+    expected_num_cudagraph_captured = (
+        4  # num_cudagraph_sizes * num cudagraphs to capture
+    )
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 1
+        expected_num_piecewise_capturable_graphs_seen = 1
+        expected_num_backend_compilations = 1
+    else:
+        expected_num_piecewise_graphs_seen = 3
+        expected_num_piecewise_capturable_graphs_seen = 2
+        expected_num_backend_compilations = 2
+
     @support_torch_compile
     class A(nn.Module):
         def __init__(
@@ -104,12 +127,11 @@ def test_ignore_torch_compile_decorator():
 
     # A has support_torch_compile
     with compilation_counter.expect(
-        num_graphs_seen=1,
-        num_piecewise_graphs_seen=3,
-        num_piecewise_capturable_graphs_seen=2,
-        num_backend_compilations=2,
-        num_cudagraph_captured=4,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
@@ -131,12 +153,11 @@ def test_ignore_torch_compile_decorator():
 
     # C's support_torch_compile should override B's ignore_torch_compile
     with compilation_counter.expect(
-        num_graphs_seen=1,
-        num_piecewise_graphs_seen=3,
-        num_piecewise_capturable_graphs_seen=2,
-        num_backend_compilations=2,
-        num_cudagraph_captured=4,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_graphs_seen=expected_num_graphs_seen,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
+        num_cudagraph_captured=expected_num_cudagraph_captured,
     ):
         run_model(vllm_config, mod_C, cudagraph_runtime_mode)
 
@@ -179,7 +200,15 @@ class A(nn.Module):
         return x
 
 
-def test_conditional_compile_enable_if():
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch):
+    # disable compile cache so that we can count the number of compilations
+    # appropriately
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
+
     vllm_config = VllmConfig(
         cache_config=CacheConfig(
             kv_sharing_fast_prefill=True,
@@ -189,7 +218,7 @@ def test_conditional_compile_enable_if():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
-            use_inductor_graph_partition=False,  # TODO test both
+            use_inductor_graph_partition=use_inductor_graph_partition,
         ),
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -197,17 +226,26 @@ def test_conditional_compile_enable_if():
     with set_current_vllm_config(vllm_config):
         mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
 
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 2
+        expected_num_piecewise_capturable_graphs_seen = 2
+        expected_num_backend_compilations = 2
+    else:
+        expected_num_piecewise_graphs_seen = 6
+        expected_num_piecewise_capturable_graphs_seen = 4
+        expected_num_backend_compilations = 4
+
     # A has support_torch_compile but enable_if fn returns False
     # enalbe_if will be True for B, so we expect mod1 and mod2
     # to be compiled
     with compilation_counter.expect(
         num_graphs_seen=2,
-        num_piecewise_graphs_seen=6,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
         # 3 piecewise graphs per instance of B()
-        num_piecewise_capturable_graphs_seen=4,
-        num_backend_compilations=4,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
         num_cudagraph_captured=8,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # num_cudagraph_sizes * num cudagraphable graphs to capture
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)
 
@@ -222,20 +260,30 @@ def test_conditional_compile_enable_if():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
-            use_inductor_graph_partition=False,  # TODO test both?
+            use_inductor_graph_partition=use_inductor_graph_partition,
         ),
     )
 
     with set_current_vllm_config(vllm_config):
         mod_A = A(vllm_config=vllm_config, prefix="").eval().cuda()
 
+    if use_inductor_graph_partition:
+        expected_num_piecewise_graphs_seen = 1
+        expected_num_piecewise_capturable_graphs_seen = 1
+        expected_num_backend_compilations = 1
+    else:
+        # 3 attn ops and 4 non-attn ops
+        expected_num_piecewise_graphs_seen = 7
+        expected_num_piecewise_capturable_graphs_seen = 4
+        expected_num_backend_compilations = 4
+
     with compilation_counter.expect(
         num_graphs_seen=1,
-        num_piecewise_graphs_seen=7,
+        num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen,
         # 3 attn ops and 4 non-attn ops
-        num_piecewise_capturable_graphs_seen=4,
-        num_backend_compilations=4,
+        num_piecewise_capturable_graphs_seen=expected_num_piecewise_capturable_graphs_seen,
+        num_backend_compilations=expected_num_backend_compilations,
         num_cudagraph_captured=8,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        # num_cudagraph_sizes * num cudagraphable graphs to capture
     ):
         run_model(vllm_config, mod_A, cudagraph_runtime_mode)

From 8865da157bea1b25afc4ff28031858d557363290 Mon Sep 17 00:00:00 2001
From: "sangho.lee" <sanghol@allenai.org>
Date: Wed, 15 Oct 2025 02:13:59 -0500
Subject: [PATCH 20/51] [Bugfix][Multi Modal] Fix incorrect Molmo token
 processing (#26873)

Signed-off-by: sanghol <sanghol@allenai.org>
---
 vllm/model_executor/models/molmo.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 106aaf413e99b..dce94d181c4cd 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1264,13 +1264,16 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
     ) -> list[int]:
         processor = self.info.get_hf_processor()
 
-        # Apply the chat template to the tokens
+        # The chat template is already applied to the prompt tokens
+        # Use message_format="none" to avoid applying it again
+        # Prepend an empty space if `always_start_with_space` is True
         tokens = processor.processor.get_tokens_input(  # type: ignore
             self.info.get_tokenizer().decode(prompt_tokens),
-            message_format=processor.message_format,
+            message_format="none",
             always_start_with_space=processor.always_start_with_space,
         )
 
+        # Prepend a BOS token id to the tokens
         processed_data = self.info.ctx.call_hf_processor(
             processor,  # type: ignore
             dict(tokens=tokens),

From 302ef403a2305e9158064f8e386d1b5284d12cb2 Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Wed, 15 Oct 2025 15:16:44 +0800
Subject: [PATCH 21/51] [DSA][MLA] Tiny refactor on DeepSeek to make it
 reusable for different backends (#26656)

Signed-off-by: MengqingCao <cmq0113@163.com>
---
 vllm/attention/layer.py                    |  2 ++
 vllm/model_executor/models/deepseek_mtp.py | 10 ++++++++--
 vllm/model_executor/models/deepseek_v2.py  |  3 ++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 8b5b87cba4044..16c5799f7d0be 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -587,6 +587,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         prefix: str = "",
         use_sparse: bool = False,
         indexer: object | None = None,
+        **extra_impl_args,
     ):
         super().__init__()
         self.num_heads = num_heads
@@ -639,6 +640,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
             v_head_dim=self.v_head_dim,
             kv_b_proj=kv_b_proj,
             indexer=indexer,
+            **extra_impl_args,
         )
 
         self.use_direct_call = not current_platform.opaque_attention_op()
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index de80833130179..576977b00e616 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -17,9 +17,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 
-from .deepseek_v2 import DeepseekV2DecoderLayer, get_spec_layer_idx_from_weight_name
+from .deepseek_v2 import (
+    DeepseekV2DecoderLayer,
+    get_spec_layer_idx_from_weight_name,
+)
 from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
@@ -56,6 +60,8 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
 
+        self.device = current_platform.device_type
+
         self.is_v32 = hasattr(config, "index_topk")
         if self.is_v32:
             topk_tokens = config.index_topk
@@ -63,7 +69,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
                 vllm_config.scheduler_config.max_num_batched_tokens,
                 topk_tokens,
                 dtype=torch.int32,
-                device="cuda",
+                device=self.device,
             )
         else:
             topk_indices_buffer = None
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 970fa80826aba..3d26327c732ea 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1165,6 +1165,7 @@ class DeepseekV2Model(nn.Module):
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         self.config = config
+        self.device = current_platform.device_type
 
         self.vocab_size = config.vocab_size
         self.is_v32 = hasattr(config, "index_topk")
@@ -1174,7 +1175,7 @@ class DeepseekV2Model(nn.Module):
                 vllm_config.scheduler_config.max_num_batched_tokens,
                 topk_tokens,
                 dtype=torch.int32,
-                device="cuda",
+                device=self.device,
             )
         else:
             topk_indices_buffer = None

From b8a45721576e8cc0a0a6b8f82e7aec423068dc64 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 15 Oct 2025 15:17:37 +0800
Subject: [PATCH 22/51] [Misc] Use helper function to generate dummy messages
 in OpenAI MM tests (#26875)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/entrypoints/openai/test_audio.py  |  71 +++++++----------
 tests/entrypoints/openai/test_video.py  |  99 ++++++++---------------
 tests/entrypoints/openai/test_vision.py | 101 ++++++++----------------
 3 files changed, 93 insertions(+), 178 deletions(-)

diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index a96f0134c2ffb..a2d8993441fcd 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -53,21 +53,34 @@ def base64_encoded_audio() -> dict[str, str]:
     }
 
 
+def dummy_messages_from_audio_url(
+    audio_urls: str | list[str],
+    content_text: str = "What's happening in this audio?",
+):
+    if isinstance(audio_urls, str):
+        audio_urls = [audio_urls]
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "audio_url", "audio_url": {"url": audio_url}}
+                    for audio_url in audio_urls
+                ),
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
 async def test_single_chat_session_audio(
     client: openai.AsyncOpenAI, model_name: str, audio_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio_url", "audio_url": {"url": audio_url}},
-                {"type": "text", "text": "What's happening in this audio?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_audio_url(audio_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -138,20 +151,9 @@ async def test_single_chat_session_audio_base64encoded(
     audio_url: str,
     base64_encoded_audio: dict[str, str],
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"  # noqa: E501
-                    },
-                },
-                {"type": "text", "text": "What's happening in this audio?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_audio_url(
+        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
+    )
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -252,15 +254,7 @@ async def test_single_chat_session_input_audio(
 async def test_chat_streaming_audio(
     client: openai.AsyncOpenAI, model_name: str, audio_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio_url", "audio_url": {"url": audio_url}},
-                {"type": "text", "text": "What's happening in this audio?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_audio_url(audio_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -365,18 +359,7 @@ async def test_chat_streaming_input_audio(
 async def test_multi_audio_input(
     client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *(
-                    {"type": "audio_url", "audio_url": {"url": audio_url}}
-                    for audio_url in audio_urls
-                ),
-                {"type": "text", "text": "What's happening in this audio?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_audio_url(audio_urls)
 
     if len(audio_urls) > MAXIMUM_AUDIOS:
         with pytest.raises(openai.BadRequestError):  # test multi-audio input
diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py
index 4c7d1c14ca17b..7ecdac518f97f 100644
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -55,21 +55,34 @@ def base64_encoded_video() -> dict[str, str]:
     }
 
 
+def dummy_messages_from_video_url(
+    video_urls: str | list[str],
+    content_text: str = "What's in this video?",
+):
+    if isinstance(video_urls, str):
+        video_urls = [video_urls]
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "video_url", "video_url": {"url": video_url}}
+                    for video_url in video_urls
+                ),
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
 async def test_single_chat_session_video(
     client: openai.AsyncOpenAI, model_name: str, video_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "video_url", "video_url": {"url": video_url}},
-                {"type": "text", "text": "What's in this video?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_video_url(video_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -137,15 +150,7 @@ async def test_error_on_invalid_video_url_type(
 async def test_single_chat_session_video_beamsearch(
     client: openai.AsyncOpenAI, model_name: str, video_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "video_url", "video_url": {"url": video_url}},
-                {"type": "text", "text": "What's in this video?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_video_url(video_url)
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -172,20 +177,9 @@ async def test_single_chat_session_video_base64encoded(
     video_url: str,
     base64_encoded_video: dict[str, str],
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "video_url",
-                    "video_url": {
-                        "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"  # noqa: E501
-                    },
-                },
-                {"type": "text", "text": "What's in this video?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_video_url(
+        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
+    )
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -231,20 +225,10 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
     video_url: str,
     base64_encoded_video: dict[str, str],
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "video_url",
-                    "video_url": {
-                        "url": f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"  # noqa: E501
-                    },
-                },
-                {"type": "text", "text": "What's in this video?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_video_url(
+        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
+    )
+
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
@@ -265,15 +249,7 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
 async def test_chat_streaming_video(
     client: openai.AsyncOpenAI, model_name: str, video_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "video_url", "video_url": {"url": video_url}},
-                {"type": "text", "text": "What's in this video?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_video_url(video_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -318,18 +294,7 @@ async def test_chat_streaming_video(
 async def test_multi_video_input(
     client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *(
-                    {"type": "video_url", "video_url": {"url": video_url}}
-                    for video_url in video_urls
-                ),
-                {"type": "text", "text": "What's in this video?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_video_url(video_urls)
 
     if len(video_urls) > MAXIMUM_VIDEOS:
         with pytest.raises(openai.BadRequestError):  # test multi-video input
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 5a15a352f45cc..09bd0dabb799a 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -78,6 +78,27 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
     }
 
 
+def dummy_messages_from_image_url(
+    image_urls: str | list[str],
+    content_text: str = "What's in this image?",
+):
+    if isinstance(image_urls, str):
+        image_urls = [image_urls]
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                *(
+                    {"type": "image_url", "image_url": {"url": image_url}}
+                    for image_url in image_urls
+                ),
+                {"type": "text", "text": content_text},
+            ],
+        }
+    ]
+
+
 def get_hf_prompt_tokens(model_name, content, image_url):
     processor = AutoProcessor.from_pretrained(
         model_name, trust_remote_code=True, num_crops=4
@@ -107,15 +128,7 @@ async def test_single_chat_session_image(
     client: openai.AsyncOpenAI, model_name: str, image_url: str
 ):
     content_text = "What's in this image?"
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": content_text},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_image_url(image_url, content_text)
 
     max_completion_tokens = 10
     # test single completion
@@ -188,15 +201,8 @@ async def test_error_on_invalid_image_url_type(
 async def test_single_chat_session_image_beamsearch(
     client: openai.AsyncOpenAI, model_name: str, image_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": "What's in this image?"},
-            ],
-        }
-    ]
+    content_text = "What's in this image?"
+    messages = dummy_messages_from_image_url(image_url, content_text)
 
     chat_completion = await client.chat.completions.create(
         model=model_name,
@@ -226,20 +232,10 @@ async def test_single_chat_session_image_base64encoded(
     base64_encoded_image: dict[str, str],
 ):
     content_text = "What's in this image?"
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"  # noqa: E501
-                    },
-                },
-                {"type": "text", "text": content_text},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_image_url(
+        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
+        content_text,
+    )
 
     max_completion_tokens = 10
     # test single completion
@@ -293,20 +289,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
     raw_image_url = TEST_IMAGE_ASSETS[image_idx]
     expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
 
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"  # noqa: E501
-                    },
-                },
-                {"type": "text", "text": "What's in this image?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_image_url(
+        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
+    )
+
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
@@ -326,15 +312,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
 async def test_chat_streaming_image(
     client: openai.AsyncOpenAI, model_name: str, image_url: str
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": "What's in this image?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_image_url(image_url)
 
     # test single completion
     chat_completion = await client.chat.completions.create(
@@ -381,18 +359,7 @@ async def test_chat_streaming_image(
 async def test_multi_image_input(
     client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
 ):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                *(
-                    {"type": "image_url", "image_url": {"url": image_url}}
-                    for image_url in image_urls
-                ),
-                {"type": "text", "text": "What's in this image?"},
-            ],
-        }
-    ]
+    messages = dummy_messages_from_image_url(image_urls)
 
     if len(image_urls) > MAXIMUM_IMAGES:
         with pytest.raises(openai.BadRequestError):  # test multi-image input

From efdef57b1fefab62db76ff31053de9c185dbaf66 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Wed, 15 Oct 2025 00:47:50 -0700
Subject: [PATCH 23/51] [bugfix] Lazy import cv2 (#26869)

Signed-off-by: angelayi <yiangela7@gmail.com>
---
 vllm/assets/video.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index a4e67ca0b63e3..277c8ea1bf0d7 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -5,7 +5,6 @@ from dataclasses import dataclass
 from functools import lru_cache
 from typing import Any, ClassVar, Literal
 
-import cv2
 import numpy as np
 import numpy.typing as npt
 from huggingface_hub import hf_hub_download
@@ -43,6 +42,8 @@ def download_video_asset(filename: str) -> str:
 
 
 def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
+    import cv2
+
     cap = cv2.VideoCapture(path)
     if not cap.isOpened():
         raise ValueError(f"Could not open video file {path}")
@@ -78,6 +79,8 @@ def video_to_pil_images_list(path: str, num_frames: int = -1) -> list[Image.Imag
 
 
 def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
+    import cv2
+
     cap = cv2.VideoCapture(path)
     if not cap.isOpened():
         raise ValueError(f"Could not open video file {path}")

From f5ed68ef63d0c3c084688fe00b3aeb1996ca0b6f Mon Sep 17 00:00:00 2001
From: Yongye Zhu <zyy1102000@gmail.com>
Date: Wed, 15 Oct 2025 04:05:01 -0400
Subject: [PATCH 24/51] [Deepseek-V3.2][Kernel] Integrate cuda indexer k cache
 gather (#26456)

Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
---
 vllm/model_executor/models/deepseek_v2.py | 74 ++---------------------
 1 file changed, 6 insertions(+), 68 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 3d26327c732ea..f33ed735f4291 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -75,7 +75,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
-from vllm.utils import cdiv, direct_register_custom_op
+from vllm.utils import direct_register_custom_op
 from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits
 from vllm.v1.attention.backends.mla.indexer import (
     DeepseekV32IndexerBackend,
@@ -483,69 +483,6 @@ class DeepseekV32IndexerCache(torch.nn.Module, AttentionLayerBase):
         return DeepseekV32IndexerBackend
 
 
-@torch.inference_mode()
-def cp_gather_indexer_k_quant_cache(
-    kv_cache,  # [num_blocks, block_size, head_dim + 1]
-    dst_value,  # [cu_seq_lens[-1], head_dim]
-    dst_scale,  # [cu_seq_lens[-1], 4]
-    block_table,  # [batch_size, num_blocks]
-    cu_seq_lens,  # [batch_size + 1, ]
-    batch_size,
-):
-    num_blocks, block_size, _ = kv_cache.shape
-    head_dim = dst_value.shape[-1]
-    kv_cache = kv_cache.view(num_blocks, -1)
-
-    expected_value = []
-    expected_scale = []
-    for b in range(batch_size):
-        s = cu_seq_lens[b + 1] - cu_seq_lens[b]
-        if s == 0:
-            continue
-        tot = cdiv(s, block_size)
-        blocks = block_table[b, :tot]
-
-        value = []
-        scale = []
-        full_block = torch.arange(tot - 1, device=kv_cache.device, dtype=torch.int32)
-        non_remaining_value = kv_cache[
-            blocks[full_block], : block_size * head_dim
-        ].view(-1, head_dim)
-        non_remaining_scale = kv_cache[
-            blocks[full_block], block_size * head_dim :
-        ].view(-1, 4)
-
-        remaining = s - (tot - 1) * block_size
-
-        value = torch.cat(
-            [
-                non_remaining_value,
-                kv_cache[blocks[-1], : remaining * head_dim].view(-1, head_dim),
-            ],
-            dim=0,
-        )
-        scale = torch.cat(
-            [
-                non_remaining_scale,
-                kv_cache[
-                    blocks[-1],
-                    block_size * head_dim : block_size * head_dim + remaining * 4,
-                ].view(-1, 4),
-            ],
-            dim=0,
-        )
-
-        expected_value.append(value)
-        expected_scale.append(scale)
-
-    gather_value = torch.cat(expected_value, dim=0).view(-1, head_dim)
-    gather_scale = torch.cat(expected_scale, dim=0).view(-1, 4)
-    gather_value = gather_value.view(torch.float8_e4m3fn)
-    gather_scale = gather_scale.view(torch.float32)
-    dst_value.copy_(gather_value)
-    dst_scale.copy_(gather_scale)
-
-
 def sparse_attn_indexer(
     hidden_states: torch.Tensor,
     k_cache_prefix: str,
@@ -605,19 +542,20 @@ def sparse_attn_indexer(
                 dtype=torch.float8_e4m3fn,
             )
             k_scale = torch.empty(
-                [chunk.total_seq_lens, 1], device=k.device, dtype=torch.float32
+                [chunk.total_seq_lens, 4],
+                device=k.device,
+                dtype=torch.uint8,
             )
-            cp_gather_indexer_k_quant_cache(
+            ops.cp_gather_indexer_k_quant_cache(
                 kv_cache,
                 k_fp8,
                 k_scale,
                 chunk.block_table,
                 chunk.cu_seq_lens,
-                chunk.num_reqs,
             )
             logits = fp8_mqa_logits(
                 q_fp8[chunk.token_start : chunk.token_end],
-                (k_fp8, k_scale),
+                (k_fp8, k_scale.view(torch.float32)),
                 weights[chunk.token_start : chunk.token_end],
                 chunk.cu_seqlen_ks,
                 chunk.cu_seqlen_ke,

From f3c378ffa7f95317497a2cf64ac52b09a8708bc9 Mon Sep 17 00:00:00 2001
From: Zhewen Li <zhewenli@meta.com>
Date: Wed, 15 Oct 2025 01:09:56 -0700
Subject: [PATCH 25/51] [CI/Build] Add Qwen2.5-VL-7B-Instruct ChartQA Accuracy
 Tests in CI (#21810)

Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <ye.charlotte.qi@gmail.com>
---
 .../configs/Meta-Llama-3-8B-QQQ.yaml          | 12 +++++
 ...a-4-Maverick-17B-128E-Instruct-FP8-MM.yaml | 11 ++++
 ...lama-4-Maverick-17B-128E-Instruct-FP8.yaml | 11 ++++
 .../Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml   |  3 +-
 .../configs/Qwen2.5-VL-7B-Instruct.yaml       | 12 +++++
 .../configs/models-large-h100.txt             |  1 +
 .../configs/models-mm-large-h100.txt          |  1 +
 .../configs/models-mm-small.txt               |  1 +
 .../run-lm-eval-chartqa-vllm-vlm-baseline.sh  | 44 ++++++++++++++++
 .../run-lm-eval-gsm-hf-baseline.sh            |  0
 .../run-lm-eval-mmlupro-vllm-baseline.sh      | 50 +++++++++++++++++++
 .../test_lm_eval_correctness.py               | 12 +++--
 .buildkite/test-pipeline.yaml                 | 10 ++++
 13 files changed, 164 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
 create mode 100644 .buildkite/lm-eval-harness/configs/models-large-h100.txt
 create mode 100644 .buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
 create mode 100644 .buildkite/lm-eval-harness/configs/models-mm-small.txt
 create mode 100755 .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
 mode change 100644 => 100755 .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
 create mode 100644 .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh

diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
new file mode 100644
index 0000000000000..56ec933c9cc0e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
new file mode 100644
index 0000000000000..f10b937249975
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
@@ -0,0 +1,11 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.90
+limit: 100
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..96eeed04a9dc0
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -b 32 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
index a2f235f485815..aa4fb9fa03d6d 100644
--- a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
new file mode 100644
index 0000000000000..5f3c31743e75b
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2.5-VL-7B-Instruct.yaml
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
new file mode 100644
index 0000000000000..4fb0b84bc4d81
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
new file mode 100644
index 0000000000000..91e22b6459c12
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-large-h100.txt
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-mm-small.txt b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
new file mode 100644
index 0000000000000..1097d220245fc
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-mm-small.txt
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
\ No newline at end of file
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
new file mode 100755
index 0000000000000..c8db951381b0b
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.9
+
+usage() {  # Print the help text; the caller decides whether to exit.
+    echo
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do  # -m model, -l sample limit, -t tensor parallel size
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )  # unknown option: show help and fail
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit "$LIMIT"
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
old mode 100644
new mode 100755
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
new file mode 100644
index 0000000000000..d85a1721db9a5
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {  # Print the help text; the caller decides whether to exit.
+    echo
+    echo "Runs lm eval harness on MMLU Pro using vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do  # -m model, -b batch size, -l limit, -f fewshot, -t TP size
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )  # optional; the lm_eval call below falls back to "auto" when omitted
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )  # unknown option: show help and fail
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size "${BATCH_SIZE:-auto}"
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index ceea01166b7f4..f10de82b1d8e8 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
+        # text models. However, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
 
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 94c0944c838ce..a8a5bf3ad234d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -734,6 +734,16 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true

From 71557a5f7c221b63759a0d87c0b175b1bab243e6 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 15 Oct 2025 04:23:33 -0400
Subject: [PATCH 26/51] [CI] Fix mypy for `vllm/executor` (#26845)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 tools/pre_commit/mypy.py                  |  2 +-
 vllm/executor/executor_base.py            |  7 ++++---
 vllm/executor/ray_distributed_executor.py | 18 ++++++++++++------
 vllm/executor/ray_utils.py                |  7 ++++++-
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 7fdfdb37a0c0f..a3aa546347255 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -28,6 +28,7 @@ FILES = [
     "vllm/assets",
     "vllm/distributed",
     "vllm/entrypoints",
+    "vllm/executor",
     "vllm/inputs",
     "vllm/logging_utils",
     "vllm/multimodal",
@@ -44,7 +45,6 @@ SEPARATE_GROUPS = [
     "vllm/attention",
     "vllm/compilation",
     "vllm/engine",
-    "vllm/executor",
     "vllm/inputs",
     "vllm/lora",
     "vllm/model_executor",
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 2c44422ba2178..a5f83f9040023 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -18,7 +18,7 @@ from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.tasks import SupportedTask
 from vllm.utils import make_async
-from vllm.v1.outputs import PoolerOutput, SamplerOutput
+from vllm.v1.outputs import SamplerOutput
 from vllm.v1.worker.worker_base import WorkerBase
 
 logger = init_logger(__name__)
@@ -54,7 +54,7 @@ class ExecutorBase(ABC):
         self._init_executor()
         self.is_sleeping = False
         self.sleeping_tags: set[str] = set()
-        self.kv_output_aggregator = None
+        self.kv_output_aggregator: KVOutputAggregator | None = None
 
     @abstractmethod
     def _init_executor(self) -> None:
@@ -143,8 +143,9 @@ class ExecutorBase(ABC):
 
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
-    ) -> list[SamplerOutput | PoolerOutput] | None:
+    ) -> list[SamplerOutput]:
         output = self.collective_rpc("execute_model", args=(execute_model_req,))
+        assert output[0] is not None
         return output[0]
 
     def stop_remote_worker_execution_loop(self) -> None:
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 943c6a27f1e8f..59e282ac92b6d 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -217,7 +217,9 @@ class RayDistributedExecutor(DistributedExecutorBase):
                     num_gpus=num_gpus,
                     scheduling_strategy=scheduling_strategy,
                     **ray_remote_kwargs,
-                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, rpc_rank=rank)
+                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
+                    vllm_config=self.vllm_config, rpc_rank=rank
+                )
             else:
                 worker = ray.remote(
                     num_cpus=0,
@@ -225,7 +227,9 @@ class RayDistributedExecutor(DistributedExecutorBase):
                     resources={current_platform.ray_device_key: num_gpus},
                     scheduling_strategy=scheduling_strategy,
                     **ray_remote_kwargs,
-                )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, rpc_rank=rank)
+                )(RayWorkerWrapper).remote(  # type: ignore[attr-defined]
+                    vllm_config=self.vllm_config, rpc_rank=rank
+                )
             worker_metadata.append(RayWorkerMetaData(worker=worker, created_rank=rank))
 
         worker_ips = ray.get(
@@ -303,7 +307,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
                 continue
             worker_node_and_gpu_ids.append(
                 ray.get(worker.get_node_and_gpu_ids.remote())
-            )  # type: ignore
+            )  # type: ignore[attr-defined]
 
         node_workers = defaultdict(list)  # node id -> list of worker ranks
         node_gpus = defaultdict(list)  # node id -> list of gpu ids
@@ -495,7 +499,9 @@ class RayDistributedExecutor(DistributedExecutorBase):
         if async_run_tensor_parallel_workers_only:
             ray_workers = self.non_driver_workers
         ray_worker_outputs = [
-            worker.execute_method.remote(sent_method, *args, **kwargs)
+            worker.execute_method.remote(  # type: ignore[attr-defined]
+                sent_method, *args, **kwargs
+            )
             for worker in ray_workers
         ]
 
@@ -715,7 +721,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
             tasks.append(
                 asyncio.create_task(
                     _run_task_with_lock(
-                        driver_worker.execute_method.remote,
+                        driver_worker.execute_method.remote,  # type: ignore[attr-defined]
                         self.pp_locks[pp_rank],
                         "execute_model",
                         execute_model_req,
@@ -733,7 +739,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
             "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1"
         )
         coros = [
-            worker.execute_method.remote("start_worker_execution_loop")
+            worker.execute_method.remote("start_worker_execution_loop")  # type: ignore[attr-defined]
             for worker in self.non_driver_workers
         ]
         return await asyncio.gather(*coros)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index d12151bb9485a..ef5a99659f30e 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -90,14 +90,17 @@ try:
 
             execute_model_req = self.input_decoder.decode(serialized_req)
 
+            assert self.worker is not None, "Worker is not initialized"
+
             # TODO(swang): This is needed right now because Ray Compiled Graph
             # executes on a background thread, so we need to reset torch's
             # current device.
             if not self.compiled_dag_cuda_device_set:
+                assert self.worker.device is not None
                 current_platform.set_device(self.worker.device)
                 self.compiled_dag_cuda_device_set = True
 
-            output = self.worker._execute_model_spmd(
+            output = self.worker._execute_model_spmd(  # type: ignore[attr-defined]
                 execute_model_req, intermediate_tensors
             )
             # Pipeline model request and output to the next pipeline stage.
@@ -119,6 +122,7 @@ try:
                     # Not needed
                     pass
                 else:
+                    assert self.worker.device is not None
                     current_platform.set_device(self.worker.device)
 
                 self.compiled_dag_cuda_device_set = True
@@ -139,6 +143,7 @@ try:
                 scheduler_output, intermediate_tensors = scheduler_output
             else:
                 scheduler_output, intermediate_tensors = scheduler_output, None
+            assert self.worker.model_runner is not None
             output = self.worker.model_runner.execute_model(
                 scheduler_output, intermediate_tensors
             )

From 6256697997ee27819b66b038f71886f7fcdebf2e Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 15 Oct 2025 16:25:49 +0800
Subject: [PATCH 27/51] [Doc] ruff format remaining Python examples (#26795)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/features/quantization/auto_awq.md        | 10 +--
 docs/features/quantization/bitblas.md         |  4 +-
 docs/features/quantization/bnb.md             |  4 +-
 docs/features/quantization/fp8.md             |  9 ++-
 docs/features/quantization/gguf.md            | 12 ++--
 docs/features/quantization/gptqmodel.md       |  2 +-
 docs/features/quantization/int4.md            |  6 +-
 docs/features/quantization/int8.md            |  4 +-
 docs/features/quantization/modelopt.md        |  4 +-
 .../quantization/quantized_kvcache.md         |  8 ++-
 docs/features/quantization/quark.md           | 65 +++++++++++++------
 docs/getting_started/quickstart.md            |  8 ++-
 docs/models/extensions/tensorizer.md          |  4 +-
 docs/models/generative_models.md              |  6 +-
 docs/models/pooling_models.md                 | 26 +++++---
 docs/models/supported_models.md               |  4 +-
 docs/serving/expert_parallel_deployment.md    |  6 +-
 docs/serving/integrations/langchain.md        | 16 +++--
 docs/serving/openai_compatible_server.md      | 57 ++++++++--------
 .../offline_inference/openai_batch/README.md  | 14 +++-
 examples/others/tensorize_vllm_model.py       |  2 +-
 21 files changed, 166 insertions(+), 105 deletions(-)

diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md
index 182127bc91cc8..e77e8b5a1f415 100644
--- a/docs/features/quantization/auto_awq.md
+++ b/docs/features/quantization/auto_awq.md
@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
     from awq import AutoAWQForCausalLM
     from transformers import AutoTokenizer
 
-    model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-    quant_path = 'mistral-instruct-v0.2-awq'
-    quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+    model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+    quant_path = "mistral-instruct-v0.2-awq"
+    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
 
     # Load model
     model = AutoAWQForCausalLM.from_pretrained(
-        model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+        model_path,
+        low_cpu_mem_usage=True,
+        use_cache=False,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md
index 53b689ad53ff6..c3a1276576223 100644
--- a/docs/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -34,7 +34,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitblas"
+    quantization="bitblas",
 )
 ```
 
@@ -53,6 +53,6 @@ llm = LLM(
         dtype=torch.float16,
         trust_remote_code=True,
         quantization="bitblas",
-        max_model_len=1024
+        max_model_len=1024,
     )
     ```
diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md
index 3b15a6072d47a..2348c7739c066 100644
--- a/docs/features/quantization/bnb.md
+++ b/docs/features/quantization/bnb.md
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 ```
 
@@ -43,7 +43,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitsandbytes"
+    quantization="bitsandbytes",
 )
 ```
 
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index 834c03cbe05b0..a54acdbb96223 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
 
     # Configure the simple PTQ quantization
     recipe = QuantizationModifier(
-      targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+        targets="Linear",
+        scheme="FP8_DYNAMIC",
+        ignore=["lm_head"],
+    )
 
     # Apply the quantization algorithm.
     oneshot(model=model, recipe=recipe)
diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md
index 2a1c3bdd775f1..2a731e9b7e032 100644
--- a/docs/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
       conversation = [
          {
             "role": "system",
-            "content": "You are a helpful assistant"
+            "content": "You are a helpful assistant",
          },
          {
             "role": "user",
-            "content": "Hello"
+            "content": "Hello",
          },
          {
             "role": "assistant",
-            "content": "Hello! How can I assist you today?"
+            "content": "Hello! How can I assist you today?",
          },
          {
             "role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
       sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
       # Create an LLM.
-      llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-               tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+      llm = LLM(
+         model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+         tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+      )
       # Generate texts from the prompts. The output is a list of RequestOutput objects
       # that contain the prompt, generated text, and other information.
       outputs = llm.chat(conversation, sampling_params)
diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md
index 47cb2d65bae47..f14a931725da4 100644
--- a/docs/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
     calibration_dataset = load_dataset(
         "allenai/c4",
         data_files="en/c4-train.00001-of-01024.json.gz",
-        split="train"
+        split="train",
     ).select(range(1024))["text"]
 
     quant_config = QuantizeConfig(bits=4, group_size=128)
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index d6fdac7b07f7f..5d8e06ffb5d77 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
         },
         ignore=["lm_head"],
         update_size=NUM_CALIBRATION_SAMPLES,
-        dampening_frac=0.01
+        dampening_frac=0.01,
     )
     ```
 
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index af3650e701ad0..ee1de21460573 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md
index 39ae03b1bdac0..c48ccb719a79d 100644
--- a/docs/features/quantization/modelopt.md
+++ b/docs/features/quantization/modelopt.md
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
     from vllm import LLM, SamplingParams
 
     def main():
-
         model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-        # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+
+        # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
         llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
 
         sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index b2b417309e92b..e0585a88451d4 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
     from vllm import LLM, SamplingParams
 
     sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-            kv_cache_dtype="fp8",
-            calculate_kv_scales=True)
+    llm = LLM(
+        model="meta-llama/Llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
+        calculate_kv_scales=True,
+    )
     prompt = "London is the capital of"
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
     print(out)
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index 85b7d8ec84ed3..f0cd20b7335c2 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
     MAX_SEQ_LEN = 512
 
     model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, device_map="auto", torch_dtype="auto",
+        MODEL_ID,
+        device_map="auto",
+        torch_dtype="auto",
     )
     model.eval()
 
@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
     dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
     text_data = dataset["text"][:NUM_CALIBRATION_DATA]
 
-    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-        padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-        batch_size=BATCH_SIZE, drop_last=True)
+    tokenized_outputs = tokenizer(
+        text_data,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=MAX_SEQ_LEN,
+    )
+    calib_dataloader = DataLoader(
+        tokenized_outputs['input_ids'],
+        batch_size=BATCH_SIZE,
+        drop_last=True,
+    )
     ```
 
 ### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
                                         load_quant_algo_config_from_file)
 
     # Define fp8/per-tensor/static spec.
-    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-        is_dynamic=False).to_quantization_spec()
+    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
+        observer_method="min_max",
+        is_dynamic=False,
+    ).to_quantization_spec()
 
     # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-        weight=FP8_PER_TENSOR_SPEC)
+    global_quant_config = QuantizationConfig(
+        input_tensors=FP8_PER_TENSOR_SPEC,
+        weight=FP8_PER_TENSOR_SPEC,
+    )
 
     # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
     KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
     kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-    kv_cache_quant_config = {name :
-        QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                        weight=global_quant_config.weight,
-                        output_tensors=KV_CACHE_SPEC)
-        for name in kv_cache_layer_names_for_llama}
+    kv_cache_quant_config = {
+        name: QuantizationConfig(
+            input_tensors=global_quant_config.input_tensors,
+            weight=global_quant_config.weight,
+            output_tensors=KV_CACHE_SPEC,
+        )
+        for name in kv_cache_layer_names_for_llama
+    }
     layer_quant_config = kv_cache_quant_config.copy()
 
     # Define algorithm config by config file.
-    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-        'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+    LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
     algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
 
     EXCLUDE_LAYERS = ["lm_head"]
@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
         layer_quant_config=layer_quant_config,
         kv_cache_quant_config=kv_cache_quant_config,
         exclude=EXCLUDE_LAYERS,
-        algo_config=algo_config)
+        algo_config=algo_config,
+    )
     ```
 
 ### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
     EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
     exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
     with torch.no_grad():
-        exporter.export_safetensors_model(freezed_model,
-            quant_config=quant_config, tokenizer=tokenizer)
+        exporter.export_safetensors_model(
+            freezed_model,
+            quant_config=quant_config,
+            tokenizer=tokenizer,
+        )
     ```
 
 ### 5. Evaluation in vLLM
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
     # Create an LLM.
-    llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-            kv_cache_dtype='fp8',quantization='quark')
+    llm = LLM(
+        model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+        kv_cache_dtype="fp8",
+        quantization="quark",
+    )
     # Generate texts from the prompts. The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
     outputs = llm.generate(prompts, sampling_params)
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 49e1f6fac7151..1cba21cf5f6d9 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
         api_key=openai_api_key,
         base_url=openai_api_base,
     )
-    completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
-                                        prompt="San Francisco is a")
+    completion = client.completions.create(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        prompt="San Francisco is a",
+    )
     print("Completion result:", completion)
     ```
 
@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
         messages=[
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "Tell me a joke."},
-        ]
+        ],
     )
     print("Chat response:", chat_response)
     ```
diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md
index f70ab0c6f4e5c..3df80d5af6c4d 100644
--- a/docs/models/extensions/tensorizer.md
+++ b/docs/models/extensions/tensorizer.md
@@ -60,7 +60,7 @@ from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1", 
     load_format="tensorizer",
-    enable_lora=True
+    enable_lora=True,
 )
 ```
 
@@ -97,6 +97,6 @@ llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1", 
     load_format="tensorizer",
     enable_lora=True,
-    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
+    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
 )
 ```
diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md
index 05f8d16cc4ca7..9ea32ed616457 100644
--- a/docs/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
     conversation = [
         {
             "role": "system",
-            "content": "You are a helpful assistant"
+            "content": "You are a helpful assistant",
         },
         {
             "role": "user",
-            "content": "Hello"
+            "content": "Hello",
         },
         {
             "role": "assistant",
-            "content": "Hello! How can I assist you today?"
+            "content": "Hello! How can I assist you today?",
         },
         {
             "role": "user",
diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md
index 50982d3d0d0f3..45bfba2cbf594 100644
--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
 from vllm import LLM
 
 llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
-(output,) = llm.score("What is the capital of France?",
-                      "The capital of Brazil is Brasilia.")
+(output,) = llm.score(
+    "What is the capital of France?",
+    "The capital of Brazil is Brasilia.",
+)
 
 score = output.outputs.score
 print(f"Score: {score}")
@@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
 
 Here is an example to serve a model with Matryoshka Embeddings enabled.
 
-```text
+```bash
 vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
 ```
 
@@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams
 
-llm = LLM(model="jinaai/jina-embeddings-v3",
-          runner="pooling",
-          trust_remote_code=True)
-outputs = llm.embed(["Follow the white rabbit."],
-                    pooling_params=PoolingParams(dimensions=32))
+llm = LLM(
+    model="jinaai/jina-embeddings-v3",
+    runner="pooling",
+    trust_remote_code=True,
+)
+outputs = llm.embed(
+    ["Follow the white rabbit."],
+    pooling_params=PoolingParams(dimensions=32),
+)
 print(outputs[0].outputs)
 ```
 
@@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
 
 Use the following command to start vllm server.
 
-```text
+```bash
 vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
 ```
 
 You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
 
-```text
+```bash
 curl http://127.0.0.1:8000/v1/embeddings \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 3ae24f602d8c2..4ba6a72e8a869 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port  vllm serve <model_name>
 ```python
 import os
 
-os.environ['http_proxy'] = 'http://your.proxy.server:port'
-os.environ['https_proxy'] = 'http://your.proxy.server:port'
+os.environ["http_proxy"] = "http://your.proxy.server:port"
+os.environ["https_proxy"] = "http://your.proxy.server:port"
 ```
 
 ### ModelScope
diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md
index cd6515dde75ef..f1dfb05ea5d45 100644
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@@ -243,10 +243,10 @@ try:
                 "remote_engine_id": None,     # Will be populated by vLLM
                 "remote_block_ids": None,     # Will be populated by vLLM
                 "remote_host": None,          # Will be populated by vLLM
-                "remote_port": None           # Will be populated by vLLM
+                "remote_port": None,          # Will be populated by vLLM
             }
         },
-        extra_headers={"X-Request-Id": request_id}
+        extra_headers={"X-Request-Id": request_id},
     )
     
     print("-" * 50)
@@ -262,7 +262,7 @@ try:
         extra_body={
             "kv_transfer_params": prefill_response.kv_transfer_params  # Pass KV cache info
         },
-        extra_headers={"X-Request-Id": request_id}  # Same request ID
+        extra_headers={"X-Request-Id": request_id},  # Same request ID
     )
     
     print("-" * 50)
diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md
index 47074f411ac99..192a61ea5b903 100644
--- a/docs/serving/integrations/langchain.md
+++ b/docs/serving/integrations/langchain.md
@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
     ```python
     from langchain_community.llms import VLLM
 
-    llm = VLLM(model="mosaicml/mpt-7b",
-            trust_remote_code=True,  # mandatory for hf models
-            max_new_tokens=128,
-            top_k=10,
-            top_p=0.95,
-            temperature=0.8,
-            # tensor_parallel_size=... # for distributed inference
+    llm = VLLM(
+        model="mosaicml/mpt-7b",
+        trust_remote_code=True,  # mandatory for hf models
+        max_new_tokens=128,
+        top_k=10,
+        top_p=0.95,
+        temperature=0.8,
+        # for distributed inference
+        # tensor_parallel_size=...,
     )
 
     print(llm("What is the capital of France ?"))
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index fe0e1e3df378b..215c7bf0ced3c 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
     completion = client.chat.completions.create(
         model="NousResearch/Meta-Llama-3-8B-Instruct",
         messages=[
-            {"role": "user", "content": "Hello!"}
-        ]
+            {"role": "user", "content": "Hello!"},
+        ],
     )
 
     print(completion.choices[0].message)
@@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
-    ]
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
+            ],
+        },
+    ],
 )
 ```
 
@@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
     ],
     extra_body={
-        "structured_outputs": {"choice": ["positive", "negative"]}
-    }
+        "structured_outputs": {"choice": ["positive", "negative"]},
+    },
 )
 ```
 
@@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
     completion = client.chat.completions.create(
         model="NousResearch/Meta-Llama-3-8B-Instruct",
         messages=[
-            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
         ],
         extra_headers={
             "x-request-id": "sentiment-classification-00001",
-        }
+        },
     )
     print(completion._request_id)
 
@@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
         prompt="A robot may not injure a human being",
         extra_headers={
             "x-request-id": "completion-test",
-        }
+        },
     )
     print(completion._request_id)
     ```
@@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
             model="openai/whisper-large-v3-turbo",
             file=audio_file,
             language="en",
-            response_format="verbose_json"
+            response_format="verbose_json",
         )
 
     print(transcription.text)
@@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including
                 "model": "jinaai/jina-reranker-m0",
                 "text_1": "slm markdown",
                 "text_2": {
-                  "content": [
-                          {
-                              "type": "image_url",
-                              "image_url": {
-                                  "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
-                              },
-                          },
-                          {
-                              "type": "image_url",
-                              "image_url": {
-                                  "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
-                              },
-                          },
-                      ]
-                  }
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+                            },
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+                            },
+                        },
+                    ],
                 },
+            },
         )
         response.raise_for_status()
         response_json = response.json()
diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md
index 3c6f6c7a6c588..7d5a1af8f5a4a 100644
--- a/examples/offline_inference/openai_batch/README.md
+++ b/examples/offline_inference/openai_batch/README.md
@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
     """
     try:
         url = s3_client.generate_presigned_url(
-            ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
+            ClientMethod=client_method,
+            Params=method_parameters,
+            ExpiresIn=expires_in,
         )
     except ClientError:
         raise
@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
 
 s3_client = boto3.client("s3")
 input_url = generate_presigned_url(
-    s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "get_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 output_url = generate_presigned_url(
-    s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "put_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 print(f"{input_url=}")
 print(f"{output_url=}")
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
index acbfd8cda489a..2601c9eff971b 100644
--- a/examples/others/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -84,7 +84,7 @@ directly to load models:
 from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1", 
-    load_format="tensorizer"
+    load_format="tensorizer",
 )
 ```
 

From 650b51f9f90dcd0393fe33a55e25bd2688874b25 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 15 Oct 2025 16:33:52 +0800
Subject: [PATCH 28/51] [doc] add Context Parallel Deployment doc (#26877)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/serving/context_parallel_deployment.md | 47 +++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 docs/serving/context_parallel_deployment.md

diff --git a/docs/serving/context_parallel_deployment.md b/docs/serving/context_parallel_deployment.md
new file mode 100644
index 0000000000000..dacdf312ee55b
--- /dev/null
+++ b/docs/serving/context_parallel_deployment.md
@@ -0,0 +1,47 @@
+# Context Parallel Deployment
+
+Context parallel mainly solves the problem of serving long context requests. As prefill and decode present quite different characteristics and have quite different SLO (service level objectives), we need to implement context parallel separately for them. The major considerations are:
+
+- For long context prefill, we need to control the TTFT (time to first token) by amortizing the computation time of the prefill across query tokens.
+- For long context decode, we need more space for KV cache to increase the batch size (and hence the throughput).
+
+## Prefill Context Parallel
+
+During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
+
+Depending on the use case, there are two possible strategies:
+
+1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
+2. Partial query, partial key/value: If the request token length is so long that we can no longer afford to hold the full key/value tensors, then each GPU can only compute one chunk of query/key/value tensors, and we use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
+
+Both approaches are under active development.
+
+## Decode Context Parallel
+
+Due to the auto-regressive nature of decoding, every decoding step needs to compute a small amount of query tokens w.r.t. a large number of key/value tokens stored in the paged KV cache. The core of decode context parallel is how to shard the KV cache across GPUs.
+
+For a model with `H` kv-heads, a request with `T` tokens in the context needs to store `H * T` key/value tensors in the KV cache.
+
+1. If one GPU can hold them all, and the performance is good enough, then no parallelization is needed.
+2. If one GPU cannot hold them all, or we want to hold more requests in the KV cache, we can first shard the KV cache along the `H` dimension, that's the plain tensor parallel sharding. It's as simple as adding `-tp <num_gpus>` to the command line.
+3. Since `H` is limited (determined by the model architecture), when we continue to increase the tensor parallel size, the KV cache for each GPU will be duplicated `tp_size / H` times. Of course, duplication is not good for efficiency. Then we need to add decode context parallel to further shard the KV cache along the `T` dimension. This is as simple as adding `-dcp <size>` to the command line. Note that `size` does not increase the number of GPUs we need to launch, but just reduces the KV cache duplication. The dcp size should lie in the range of `[1, tp_size/H]`. With a larger dcp size, the KV cache duplication is reduced, but the communication overhead increases.
+
+Theoretically, it is possible to extend the dcp size beyond `tp_size / H` to further shard the KV cache and accelerate the decoding phase. However, since the number of query tokens is limited in decoding, it's unclear what we should do with the remaining `dcp_size - tp_size / H` GPUs for non-attention layers. For the sake of simplicity, dcp size is upper bounded by `tp_size / H`. If you want to further accelerate the decoding phase, you can consider increasing the `tp_size` first, and then increasing the dcp size.
+
+Note that kv cache can grow during decoding, and the sharding strategy needs to be carefully implemented. We use an interleaving strategy to shard the KV cache along the `T` dimension, so that kv cache for future tokens can be naturally sharded along the `T` dimension. This is proposed by [Chao Hong from Moonshot](https://github.com/youzhedian), and also explained in detail in [this paper](http://arxiv.org/abs/2507.07120).
+
+Case study:
+
+For DeepSeek-R1, we have 1 kv-head when MLA is enabled. The typical single-node deployment with `-tp 8` causes 8x KV cache duplication. We can consider adding `-dcp 8` to reduce the KV cache duplication.
+
+For Kimi-K2, the architecture is similar to DeepSeek-R1, but with more parameters. When we deploy it with `-tp 16`, the KV cache duplication is 16x. We can add `-dcp 16` to completely remove the KV cache duplication, at the cost of more communication overhead. We can also add `-dcp 8` to reduce the KV cache duplication to 2x. Although it still duplicates the KV cache twice, the communication overhead is smaller since the DCP communication only happens inside one node.
+
+For Qwen3-235B-A22B, we have 4 kv-heads. When we deploy it with `-tp 8`, the KV cache duplication is 2x. Then we can add `-dcp 2` to remove the KV cache duplication.
+
+In short, for decode context parallel, try to increase `-tp` size until you get satisfactory performance, and then add `-dcp` to reduce the KV cache duplication.
+
+Decode context parallel is supported in vLLM, for both MLA and GQA models. Some attention backends also support the combination of decode context parallel and MTP (multi-token prediction) to further accelerate the decoding phase.
+
+## Technical Discussions
+
+The main discussions happen in the `#sig-context-parallel` channel of [vLLM Slack](https://slack.vllm.ai/).

From 5210dc3940b0f6554a6db46287281d1be9d187ed Mon Sep 17 00:00:00 2001
From: Xudong Ma <madongfly@gmail.com>
Date: Wed, 15 Oct 2025 01:37:49 -0700
Subject: [PATCH 29/51] [Misc] Update TritonLanguagePlaceholder to have
 attributes that are used by Flash Linear Attention ops. (#26853)

Co-authored-by: Xudong Ma <mxd@meta.com>
---
 vllm/triton_utils/importing.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py
index e1a509a303c53..f05bc555bfdc3 100644
--- a/vllm/triton_utils/importing.py
+++ b/vllm/triton_utils/importing.py
@@ -98,3 +98,6 @@ class TritonLanguagePlaceholder(types.ModuleType):
         self.int64 = None
         self.int32 = None
         self.tensor = None
+        self.exp = None
+        self.log = None
+        self.log2 = None

From 5c3bae1a6a73aad8c0883a097079448b506fbfcc Mon Sep 17 00:00:00 2001
From: ant-yy <vito.yy@antgroup.com>
Date: Wed, 15 Oct 2025 16:44:04 +0800
Subject: [PATCH 30/51] [Fix] Remove divisibility requirement between
 num_kv_heads and tp_size in bailing_moe (#26876)

Signed-off-by: vito.yy <vito.yy@antgroup.com>
---
 vllm/model_executor/models/bailing_moe.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index a7f3ebed644fc..1549c653482f6 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -86,13 +86,12 @@ class BailingAttention(nn.Module):
         tp_size = get_tensor_model_parallel_world_size()
 
         assert self.total_num_heads % tp_size == 0
-        assert self.total_kv_heads % tp_size == 0
         assert self.total_num_heads >= self.total_kv_heads
 
         self.num_heads = self.total_num_heads // tp_size
         self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads)
         self.q_size_per_rank = self.head_dim * self.num_heads
-        self.num_kv_heads = self.total_kv_heads // tp_size
+        self.num_kv_heads = max(1, self.total_kv_heads // tp_size)
         self.kv_size_per_rank = self.num_kv_heads * self.head_dim
         self.scale = self.head_dim**-0.5
         self.use_qk_norm = getattr(config, "use_qk_norm", False)

From 7f83b4ee8ea4577f1c4b32e547b6917b9c6e2d3d Mon Sep 17 00:00:00 2001
From: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Date: Wed, 15 Oct 2025 02:17:43 -0700
Subject: [PATCH 31/51] [Easy] Get rid of unnecessary paraenthesis in
 kv_cache_manager (#26842)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
---
 vllm/v1/core/kv_cache_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index ff221048dbd19..74176e4b2051c 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -219,7 +219,7 @@ class KVCacheManager:
                 self.prefix_cache_stats.queries += request.num_tokens
                 self.prefix_cache_stats.hits += num_new_computed_tokens
 
-        return (self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens)
+        return self.create_kv_cache_blocks(computed_blocks), num_new_computed_tokens
 
     def allocate_slots(
         self,

From db1764e4e05b06c93073b9f26df7b1f3b684e638 Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Wed, 15 Oct 2025 17:32:17 +0800
Subject: [PATCH 32/51] [Platform] allow platform to init dp group (#22243)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/config/parallel.py     |  2 +-
 vllm/distributed/utils.py   | 26 +++++++++++++-------------
 vllm/platforms/cuda.py      | 34 ----------------------------------
 vllm/platforms/interface.py |  2 +-
 vllm/platforms/rocm.py      | 34 ----------------------------------
 5 files changed, 15 insertions(+), 83 deletions(-)

diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index b7ef0fef68330..944a1e8666f4b 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -334,7 +334,7 @@ class ParallelConfig:
                     self.get_next_dp_init_port(),
                     self.data_parallel_rank,
                     self.data_parallel_size,
-                    backend="gloo",
+                    backend=current_platform.dist_backend,
                 )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 0a1e04ec10f99..a3d9dbe83a124 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -415,7 +415,6 @@ class StatelessProcessGroup:
 
 
 def init_gloo_process_group(
-    backend: Backend,
     prefix_store: PrefixStore,
     group_rank: int,
     group_size: int,
@@ -432,7 +431,7 @@ def init_gloo_process_group(
             group_size,
         )
     else:
-        options = ProcessGroup.Options(backend=backend)
+        options = ProcessGroup.Options(backend="gloo")
         pg = ProcessGroup(
             prefix_store,
             group_rank,
@@ -504,24 +503,25 @@ def stateless_init_torch_distributed_process_group(
     # Use a PrefixStore to avoid accidental overrides of keys used by
     # different systems (e.g. RPC) in case the store is multi-tenant.
     prefix_store = PrefixStore(init_method, store)
+    try:
+        from vllm.platforms import current_platform
 
-    if backend == "gloo":
-        return init_gloo_process_group(
+        return current_platform.stateless_init_device_torch_dist_pg(
             backend=backend,
             prefix_store=prefix_store,
             group_rank=group_rank,
             group_size=group_size,
             timeout=timeout,
         )
-    from vllm.platforms import current_platform
-
-    return current_platform.stateless_init_device_torch_dist_pg(
-        backend=backend,
-        prefix_store=prefix_store,
-        group_rank=group_rank,
-        group_size=group_size,
-        timeout=timeout,
-    )
+    except NotImplementedError:
+        # If platform doesn't implement stateless_init_device_torch_dist_pg, it
+        # will raise a NotImplementedError. In this case, we fall back to gloo.
+        return init_gloo_process_group(
+            prefix_store=prefix_store,
+            group_rank=group_rank,
+            group_size=group_size,
+            timeout=timeout,
+        )
 
 
 def stateless_destroy_torch_distributed_process_group(pg: ProcessGroup) -> None:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 0252c3acb08c1..04c2bbb43805b 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -6,13 +6,10 @@ pynvml. However, it should not initialize cuda context.
 
 import os
 from collections.abc import Callable
-from datetime import timedelta
 from functools import cache, wraps
 from typing import TYPE_CHECKING, TypeVar
 
 import torch
-from torch.distributed import PrefixStore, ProcessGroup
-from torch.distributed.distributed_c10d import is_nccl_available
 from typing_extensions import ParamSpec
 
 # import custom ops, trigger op registration
@@ -455,37 +452,6 @@ class CudaPlatformBase(Platform):
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
 
-    @classmethod
-    def stateless_init_device_torch_dist_pg(
-        cls,
-        backend: str,
-        prefix_store: PrefixStore,
-        group_rank: int,
-        group_size: int,
-        timeout: timedelta,
-    ) -> ProcessGroup:
-        assert is_nccl_available()
-        pg: ProcessGroup = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-        )
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(
-            prefix_store, group_rank, group_size, backend_options
-        )
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-        pg._set_default_backend(backend_type)
-        backend_class._set_sequence_number_for_group()
-
-        pg._register_backend(device, backend_type, backend_class)
-        return pg
-
     @classmethod
     def device_count(cls) -> int:
         return cuda_device_count_stateless()
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 9b8d75ac22fe0..f08e62a4aa9c2 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -551,7 +551,7 @@ class Platform:
         """
         Init platform-specific torch distributed process group.
         """
-        raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
+        raise NotImplementedError
 
     @classmethod
     def is_kv_cache_dtype_supported(
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 81745257d0ae2..8fa07b10d34aa 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -2,13 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from datetime import timedelta
 from functools import cache, lru_cache, wraps
 from typing import TYPE_CHECKING
 
 import torch
-from torch.distributed import PrefixStore, ProcessGroup
-from torch.distributed.distributed_c10d import is_nccl_available
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -476,37 +473,6 @@ class RocmPlatform(Platform):
     def get_static_graph_wrapper_cls(cls) -> str:
         return "vllm.compilation.cuda_graph.CUDAGraphWrapper"
 
-    @classmethod
-    def stateless_init_device_torch_dist_pg(
-        cls,
-        backend: str,
-        prefix_store: PrefixStore,
-        group_rank: int,
-        group_size: int,
-        timeout: timedelta,
-    ) -> ProcessGroup:
-        assert is_nccl_available()
-        pg: ProcessGroup = ProcessGroup(
-            prefix_store,
-            group_rank,
-            group_size,
-        )
-        from torch.distributed.distributed_c10d import ProcessGroupNCCL
-
-        backend_options = ProcessGroupNCCL.Options()
-        backend_options._timeout = timeout
-
-        backend_class = ProcessGroupNCCL(
-            prefix_store, group_rank, group_size, backend_options
-        )
-        backend_type = ProcessGroup.BackendType.NCCL
-        device = torch.device("cuda")
-        pg._set_default_backend(backend_type)
-        backend_class._set_sequence_number_for_group()
-
-        pg._register_backend(device, backend_type, backend_class)
-        return pg
-
     @classmethod
     def device_count(cls) -> int:
         return cuda_device_count_stateless()

From d4d1a6024f526129941d2fecaec504d93f9072ed Mon Sep 17 00:00:00 2001
From: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Date: Wed, 15 Oct 2025 02:45:14 -0700
Subject: [PATCH 33/51] [Lora]Load tuned multi-lora kernel configs from json
 files (#26319)

Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Signed-off-by: Haipeng Li <li2haipeng@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/lora/ops/triton_ops/README_TUNING.md  |  51 +++++++++
 vllm/lora/ops/triton_ops/lora_expand_op.py |  23 +++--
 vllm/lora/ops/triton_ops/lora_shrink_op.py |  25 +++--
 vllm/lora/ops/triton_ops/utils.py          | 115 +++++++++++++++++++++
 4 files changed, 198 insertions(+), 16 deletions(-)
 create mode 100644 vllm/lora/ops/triton_ops/README_TUNING.md

diff --git a/vllm/lora/ops/triton_ops/README_TUNING.md b/vllm/lora/ops/triton_ops/README_TUNING.md
new file mode 100644
index 0000000000000..fda95ea71891f
--- /dev/null
+++ b/vllm/lora/ops/triton_ops/README_TUNING.md
@@ -0,0 +1,51 @@
+# Multi-LoRA Tuning
+
+**Note**: The LoRA configuration folder should be specified by exporting `VLLM_TUNED_CONFIG_FOLDER=/path/to/configs`. Without this, the shrink/expand kernels will use default configurations.
+
+## Tuning Process
+
+Multi-LoRA shrink/expand Triton kernel tuning follows a methodology similar to [Triton MoE tuning](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_moe.py).
+
+**Step 1**
+Define the search space. An example search space:
+
+```python
+block_m_range = [16, 32, 64, 128, 256]
+block_n_range = [32, 64, 128, 256]
+block_k_range = [32, 64, 128, 256]
+num_warps_range = [4, 8]
+num_stage_range = [2, 3, 4, 5]
+num_ctas_range = [1]
+split_k_range = [4, 8, 16, 32, 64]
+```
+
+**Step 2**
+Get all hidden_state sizes and num_slices that the target model uses for a specific TP size.
+
+For example, we can acquire this information by simply checking [add_lora_linear](https://github.com/li2haipeng/vllm/blob/multi_lora_v01011/vllm/lora/punica_wrapper/punica_gpu.py#L192):
+
+```python
+print(f"x_shape: {x.view(-1, x.shape[-1]).shape}")
+print(f"num_sclises: {len(output_slices)}")
+for i in range(len(output_slices)):
+    print(f"a{i} shape: {lora_a_stacked[i].shape}")
+    print(f"b{i} shape: {lora_b_stacked[i].shape}")
+print("y_shape", y.shape)
+```
+
+**Step 3**
+Benchmark the shrink/expand kernel runtime with different kernel configurations generated from the pre-defined search space by performing a grid search to find the optimal kernel configuration. vLLM's [benchmark_lora.py](https://github.com/vllm-project/vllm/blob/main/benchmarks/kernels/benchmark_lora.py) can be used to search for configurations for different shapes.
+
+## Config Files
+
+### File Name
+
+For `shrink`, the config file is named as `{gpu_name}_SHRINK.json`, e.g. `NVIDIA_H200_SHRINK.json`.
+
+For `expand`, the config file is named as `{gpu_name}_EXPAND_{add_input}.json`, e.g. `NVIDIA_H200_EXPAND_TRUE.json`.
+
+The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()`; spaces and hyphens in the returned name are replaced with underscores.
+
+### Json Structure
+
+Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n]`
diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py
index a7a552b9903d5..c8330455985aa 100644
--- a/vllm/lora/ops/triton_ops/lora_expand_op.py
+++ b/vllm/lora/ops/triton_ops/lora_expand_op.py
@@ -10,7 +10,7 @@ https://arxiv.org/abs/2310.18547
 import torch
 
 from vllm.lora.ops.triton_ops.kernel_utils import do_expand_kernel
-from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr
+from vllm.lora.ops.triton_ops.utils import _get_lora_b_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
 from vllm.utils import direct_register_custom_op
 
@@ -201,12 +201,21 @@ def _lora_expand(
     NUM_SLICES = len(lora_b_weights)
 
     # Triton kernel configs.
-    BLOCK_M = 64
-    BLOCK_N = 128
-    BLOCK_K = 16
-    NUM_WARPS = 4
-    NUM_CTAS = 1
-    NUM_STAGES = 2
+    kernel_config = get_lora_op_configs(
+        op_type="expand",
+        max_loras=MAX_LORAS,
+        batch=M,
+        hidden_size=MAX_N,
+        rank=K,
+        num_slices=NUM_SLICES,
+        add_inputs=add_inputs,
+    )
+    BLOCK_M = kernel_config["block_m"]
+    BLOCK_N = kernel_config["block_n"]
+    BLOCK_K = kernel_config["block_k"]
+    NUM_WARPS = kernel_config["num_warps"]
+    NUM_CTAS = kernel_config["num_ctas"]
+    NUM_STAGES = kernel_config["num_stages"]
 
     EVEN_K = K % BLOCK_K == 0  # type: ignore
 
diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py
index 1e7e43e30de78..9cba8f4944486 100644
--- a/vllm/lora/ops/triton_ops/lora_shrink_op.py
+++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py
@@ -10,7 +10,7 @@ https://arxiv.org/abs/2310.18547
 import torch
 
 from vllm.lora.ops.triton_ops.kernel_utils import do_shrink_kernel
-from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr
+from vllm.lora.ops.triton_ops.utils import _get_lora_a_ptr, get_lora_op_configs
 from vllm.triton_utils import tl, triton
 from vllm.utils import direct_register_custom_op
 
@@ -177,14 +177,21 @@ def _lora_shrink(
     MAX_LORAS = lora_ids.size(0)
 
     # Triton kernel configs
-    BLOCK_M = 32
-    BLOCK_N = 16
-    BLOCK_K = 256 if M < 128 else 32
-    SPLIT_K = 64 if M < 128 else 8
-    NUM_WARPS = 4
-    NUM_CTAS = 1
-    NUM_STAGES = 2
-
+    kernel_config = get_lora_op_configs(
+        "shrink",
+        max_loras=MAX_LORAS,
+        batch=M,
+        hidden_size=K,
+        rank=N,
+        num_slices=NUM_SLICES,
+    )
+    BLOCK_M = kernel_config["block_m"]
+    BLOCK_N = kernel_config["block_n"]
+    BLOCK_K = kernel_config["block_k"]
+    SPLIT_K = kernel_config["split_k"]
+    NUM_WARPS = kernel_config["num_warps"]
+    NUM_STAGES = kernel_config["num_stages"]
+    NUM_CTAS = kernel_config["num_ctas"]
     EVEN_K = K % (BLOCK_K * SPLIT_K) == 0  # type: ignore
 
     # TODO (varun): This grid formulation maximizes parallelization at the
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py
index 3a3e8fc8931e8..9ffb6dc3d85e5 100644
--- a/vllm/lora/ops/triton_ops/utils.py
+++ b/vllm/lora/ops/triton_ops/utils.py
@@ -1,8 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import functools
+import json
+from pathlib import Path
+from typing import Any
+
 import torch
 
+from vllm import envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
 _LORA_A_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
 _LORA_B_PTR_DICT: dict[tuple[int, ...], tuple[torch.tensor, ...]] = {}
 
@@ -133,3 +143,108 @@ def _get_lora_b_ptr(
         MAX_N,
     )
     return _LORA_B_PTR_DICT.get(key)
+
+
+@functools.lru_cache
+def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None:
+    user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER
+    if user_defined_config_folder is not None:
+        gpu_name = torch.cuda.get_device_name()
+        gpu_name = gpu_name.replace(" ", "_")
+        gpu_name = gpu_name.replace("-", "_")
+
+        config_fname = None
+        if op_type == "shrink":
+            config_fname = f"{gpu_name}_{op_type.upper()}.json"
+        else:
+            assert op_type == "expand"
+            config_fname = (
+                f"{gpu_name}_{op_type.upper()}_{str(add_inputs).upper()}.json"
+            )
+
+        config_path = Path(f"{user_defined_config_folder}/{config_fname}")
+        if not config_path.exists():
+            logger.warning_once(f"No LoRA kernel configs founded in {config_path}")
+            return None
+
+        # Load json
+        logger.info_once(f"Using tuned LoRA kernel configs from {config_path}.")
+        with open(str(config_path)) as f:
+            config_data = json.load(f)
+    else:
+        config_data = None
+
+    return config_data
+
+
+@functools.lru_cache
+def get_lora_op_configs(
+    op_type: str,
+    max_loras: int,
+    batch: int,
+    hidden_size: int,
+    rank: int,
+    num_slices: int,
+    add_inputs: bool | None = None,
+) -> dict[str, int | None]:
+    assert op_type in ["shrink", "expand"]
+
+    # default config
+    default = {}
+    if op_type == "shrink":
+        default = {
+            "block_m": 32,
+            "block_n": 16,
+            "block_k": 256 if batch < 128 else 32,
+            "split_k": 64 if batch < 128 else 8,
+            "num_warps": 4,
+            "num_ctas": 1,
+            "num_stages": 2,
+            "max_nreg": None,
+        }
+    else:
+        default = {
+            "block_m": 64,
+            "block_n": 128,
+            "block_k": 16,
+            "num_warps": 4,
+            "num_ctas": 1,
+            "num_stages": 2,
+            "max_nreg": None,
+        }
+    m = batch
+
+    k, n = (hidden_size, rank) if op_type == "shrink" else (rank, hidden_size)
+
+    config_data: Any
+    config_data = load_lora_op_config(op_type, add_inputs)
+    if not config_data:
+        logger.warning_once("Using default LoRA kernel configs")
+        return default
+
+    # config is structured as config_data[max_loras][num_slices][m][k][n] = {}
+    # slice by max_loras
+    config_data = (
+        config_data.get(str(max_loras))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - max_loras))]
+    )
+    # slice by num_slices
+    config_data = config_data[str(num_slices)]
+    # slice by m
+    config_data = (
+        config_data.get(str(m))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - m))]
+    )
+    # slice by k
+    config_data = (
+        config_data.get(str(k))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - k))]
+    )
+    # slice by n
+    config_data = (
+        config_data.get(str(n))
+        or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - n))]
+    )
+
+    assert config_data is not None
+    return config_data

From f54f85129e4665c16f39b097463c3c350ef34210 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Wed, 15 Oct 2025 19:14:41 +0800
Subject: [PATCH 34/51] [Model][2/N] Improve all pooling task | Support
 multi-vector retrieval (#25370)

Signed-off-by: wang.yuqi <noooop@126.com>
---
 examples/offline_inference/pooling/README.md  |   6 +
 .../pooling/multi_vector_retrieval.py         |  56 +++
 .../prithvi_geospatial_mae_io_processor.py    |   2 +-
 examples/online_serving/pooling/README.md     |   6 +
 .../pooling/multi_vector_retrieval_client.py  |  54 +++
 tests/conftest.py                             |   8 +-
 .../entrypoints/pooling/llm/test_classify.py  |   2 +-
 .../entrypoints/pooling/llm/test_embedding.py |   7 +
 tests/entrypoints/pooling/llm/test_encode.py  |  12 +-
 tests/entrypoints/pooling/llm/test_reward.py  |  23 +-
 .../pooling/openai/test_embedding.py          |  18 +
 .../entrypoints/pooling/openai/test_rerank.py |  19 +-
 .../pooling/test_multi_vector_retrieval.py    |  45 ++
 .../test_pooler_config_init_behaviour.py      |  58 ++-
 .../pooling/test_token_classification.py      |   4 +-
 .../multimodal/pooling/test_prithvi_mae.py    |   2 +-
 .../my_gemma_embedding.py                     |   2 +-
 .../test_io_processor_plugins.py              |   5 +-
 tests/test_pooling_params.py                  |  93 +++-
 vllm/entrypoints/llm.py                       |  37 +-
 vllm/entrypoints/openai/api_server.py         |  21 +-
 vllm/entrypoints/openai/protocol.py           |   4 +-
 vllm/entrypoints/openai/serving_pooling.py    |  14 +-
 vllm/model_executor/layers/pooler.py          | 422 +++++++++++-------
 vllm/model_executor/models/adapters.py        |  42 +-
 vllm/model_executor/models/bert.py            |  22 +-
 vllm/model_executor/models/bert_with_rope.py  |  14 +-
 vllm/model_executor/models/clip.py            |   2 +-
 vllm/model_executor/models/gpt2.py            |  11 +-
 vllm/model_executor/models/gritlm.py          |   2 +-
 vllm/model_executor/models/internlm2.py       |   2 +-
 vllm/model_executor/models/jamba.py           |  10 +-
 vllm/model_executor/models/jina_vl.py         |  12 +-
 vllm/model_executor/models/modernbert.py      |  20 +-
 vllm/model_executor/models/qwen2_rm.py        |   6 +-
 vllm/model_executor/models/roberta.py         |  26 +-
 vllm/model_executor/models/terratorch.py      |   2 +-
 .../models/transformers_pooling.py            |  18 +-
 vllm/pooling_params.py                        |  61 +--
 vllm/tasks.py                                 |   2 +-
 vllm/v1/worker/gpu_model_runner.py            |  13 +-
 41 files changed, 786 insertions(+), 399 deletions(-)
 create mode 100644 examples/offline_inference/pooling/multi_vector_retrieval.py
 create mode 100644 examples/online_serving/pooling/multi_vector_retrieval_client.py
 create mode 100644 tests/models/language/pooling/test_multi_vector_retrieval.py

diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
index 79afbd9cfac47..7c535e91afac8 100644
--- a/examples/offline_inference/pooling/README.md
+++ b/examples/offline_inference/pooling/README.md
@@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
 python examples/offline_inference/pooling/embed_matryoshka_fy.py
 ```
 
+## Multi vector retrieval usage
+
+```bash
+python examples/offline_inference/pooling/multi_vector_retrieval.py
+```
+
 ## Named Entity Recognition (NER) usage
 
 ```bash
diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/offline_inference/pooling/multi_vector_retrieval.py
new file mode 100644
index 0000000000000..8b8892117d378
--- /dev/null
+++ b/examples/offline_inference/pooling/multi_vector_retrieval.py
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(
+        model="BAAI/bge-m3",
+        runner="pooling",
+        enforce_eager=True,
+    )
+    return parser.parse_args()
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass runner="pooling" for embedding models
+    llm = LLM(**vars(args))
+
+    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
+    outputs = llm.embed(prompts)
+
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for prompt, output in zip(prompts, outputs):
+        embeds = output.outputs.embedding
+        print(len(embeds))
+
+    # Generate embedding for each token. The output is a list of PoolingRequestOutput.
+    outputs = llm.encode(prompts, pooling_task="token_embed")
+
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for prompt, output in zip(prompts, outputs):
+        multi_vector = output.outputs.data
+        print(multi_vector.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
index 418c40645f9f2..6c47b57154386 100644
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@@ -40,7 +40,7 @@ def main():
         model_impl="terratorch",
     )
 
-    pooling_params = PoolingParams(task="encode", softmax=False)
+    pooling_params = PoolingParams(task="token_classify", activation=False)
     pooler_output = llm.encode(
         img_prompt,
         pooling_params=pooling_params,
diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md
index ac4e40221edf1..91345e0ae7785 100644
--- a/examples/online_serving/pooling/README.md
+++ b/examples/online_serving/pooling/README.md
@@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py
 python examples/online_serving/pooling/jinaai_rerank_client.py
 ```
 
+## Multi vector retrieval usage
+
+```bash
+python examples/online_serving/pooling/multi_vector_retrieval_client.py
+```
+
 ## Named Entity Recognition (NER) usage
 
 ```bash
diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/online_serving/pooling/multi_vector_retrieval_client.py
new file mode 100644
index 0000000000000..ef8c4745aa531
--- /dev/null
+++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Example online usage of Pooling API for multi vector retrieval.
+
+Run `vllm serve <model> --runner pooling`
+to start up the server in vLLM. e.g.
+
+vllm serve BAAI/bge-m3
+"""
+
+import argparse
+
+import requests
+import torch
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="BAAI/bge-m3")
+
+    return parser.parse_args()
+
+
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/pooling"
+    model_name = args.model
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    prompt = {"model": model_name, "input": prompts}
+
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    for output in pooling_response.json()["data"]:
+        multi_vector = torch.tensor(output["data"])
+        print(multi_vector.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/tests/conftest.py b/tests/conftest.py
index 2fde7f97836d6..9126b3d668b9c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1011,8 +1011,12 @@ class VllmRunner:
         req_outputs = self.llm.embed(inputs, *args, **kwargs)
         return [req_output.outputs.embedding for req_output in req_outputs]
 
-    def encode(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.llm.encode(prompts)
+    def token_embed(self, prompts: list[str]) -> list[list[float]]:
+        req_outputs = self.llm.encode(prompts, pooling_task="token_embed")
+        return [req_output.outputs.data for req_output in req_outputs]
+
+    def token_classify(self, prompts: list[str]) -> list[list[float]]:
+        req_outputs = self.llm.encode(prompts, pooling_task="token_classify")
         return [req_output.outputs.data for req_output in req_outputs]
 
     def reward(self, prompts: list[str]) -> list[list[float]]:
diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py
index 488c82c9fe7fd..96f634ee0a8c7 100644
--- a/tests/entrypoints/pooling/llm/test_classify.py
+++ b/tests/entrypoints/pooling/llm/test_classify.py
@@ -63,7 +63,7 @@ def test_encode_api(llm: LLM):
     # chunked prefill does not support all pooling
     err_msg = "pooling_task must be one of.+"
     with pytest.raises(ValueError, match=err_msg):
-        llm.encode(prompts, use_tqdm=False)
+        llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
 
 
 def test_score_api(llm: LLM):
diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py
index c53941390bd10..5455b5f91fc09 100644
--- a/tests/entrypoints/pooling/llm/test_embedding.py
+++ b/tests/entrypoints/pooling/llm/test_embedding.py
@@ -35,6 +35,13 @@ def llm():
     cleanup_dist_env_and_memory()
 
 
+@pytest.mark.skip_global_cleanup
+def test_encode_api(llm: LLM):
+    outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
+    multi_vector = outputs[0].outputs.data
+    assert multi_vector.shape == (11, 384)
+
+
 def test_pooling_params(llm: LLM):
     def get_outputs(normalize):
         outputs = llm.embed(
diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py
index 9ba380334e5a2..ca85d2758fce4 100644
--- a/tests/entrypoints/pooling/llm/test_encode.py
+++ b/tests/entrypoints/pooling/llm/test_encode.py
@@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM):
     ]
 
     # Multiple PoolingParams should be matched with each prompt
-    outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
+    outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
     assert len(PROMPTS) == len(outputs)
 
     # Exception raised, if the size of params does not match the size of prompts
     with pytest.raises(ValueError):
-        outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
+        outputs = llm.encode(
+            PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
+        )
 
     # Single PoolingParams should be applied to every prompt
     single_pooling_params = PoolingParams()
-    outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
+    outputs = llm.encode(
+        PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
+    )
     assert len(PROMPTS) == len(outputs)
 
     # pooling_params is None, default params should be applied
-    outputs = llm.encode(PROMPTS, pooling_params=None)
+    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
     assert len(PROMPTS) == len(outputs)
 
 
diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py
index 8312ff180b36f..81058dbad891b 100644
--- a/tests/entrypoints/pooling/llm/test_reward.py
+++ b/tests/entrypoints/pooling/llm/test_reward.py
@@ -36,22 +36,23 @@ def llm():
     cleanup_dist_env_and_memory()
 
 
-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
-    def get_outputs(softmax):
+    def get_outputs(activation):
         outputs = llm.reward(
-            prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False
+            prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False
         )
         return torch.cat([x.outputs.data for x in outputs])
 
-    default = get_outputs(softmax=None)
-    w_softmax = get_outputs(softmax=True)
-    wo_softmax = get_outputs(softmax=False)
+    default = get_outputs(activation=None)
+    w_activation = get_outputs(activation=True)
+    wo_activation = get_outputs(activation=False)
 
-    assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax."
-    assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), (
-        "wo_softmax should not use softmax."
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
     )
-    assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), (
-        "w_softmax should be close to softmax(wo_softmax)."
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
     )
diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py
index 8a3d298a48e2e..ab8ca9d68e0e7 100644
--- a/tests/entrypoints/pooling/openai/test_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_embedding.py
@@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import (
     EMBED_DTYPE_TO_TORCH_DTYPE,
     EmbeddingResponse,
+    PoolingResponse,
 )
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
@@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
     assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
         "w_normal should be close to normal(wo_normal)."
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 384
diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py
index 9980fcff16c15..e43148d25feeb 100644
--- a/tests/entrypoints/pooling/openai/test_rerank.py
+++ b/tests/entrypoints/pooling/openai/test_rerank.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn.functional as F
 
 from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import RerankResponse
+from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse
 
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
@@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
     assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
         "w_activation should be close to activation(wo_activation)."
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 1
diff --git a/tests/models/language/pooling/test_multi_vector_retrieval.py b/tests/models/language/pooling/test_multi_vector_retrieval.py
new file mode 100644
index 0000000000000..302f2df135579
--- /dev/null
+++ b/tests/models/language/pooling/test_multi_vector_retrieval.py
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModel
+
+from tests.models.utils import check_embeddings_close
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["BAAI/bge-m3"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@torch.inference_mode
+def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
+    with vllm_runner(
+        model,
+        runner="pooling",
+        max_model_len=None,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.token_embed(example_prompts)
+
+    with hf_runner(
+        model,
+        auto_cls=AutoModel,
+    ) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            embedding = output.last_hidden_state[0].float()
+            # normal
+            hf_outputs.append(embedding.cpu())
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        check_embeddings_close(
+            embeddings_0_lst=hf_output,
+            embeddings_1_lst=vllm_output,
+            name_0="hf",
+            name_1="vllm",
+            tol=1e-2,
+        )
diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
index 674bf02b7b98b..55663ee3f1b41 100644
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
     ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
-def test_reward_models_using_softmax(
+def test_reward_models_using_activation(
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
         model,
         max_model_len=1024,
         dtype=dtype,
-        pooler_config=PoolerConfig(softmax=False),
+        pooler_config=PoolerConfig(activation=False),
     ) as vllm_model:
-        wo_softmax = vllm_model.encode(example_prompts)
+        wo_activation = vllm_model.reward(example_prompts)
 
     with vllm_runner(
-        model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
+        model,
+        max_model_len=1024,
+        dtype=dtype,
+        pooler_config=PoolerConfig(activation=True),
     ) as vllm_model:
-        w_softmax = vllm_model.encode(example_prompts)
+        w_activation = vllm_model.reward(example_prompts)
 
-    for wo, w in zip(wo_softmax, w_softmax):
+    for wo, w in zip(wo_activation, w_activation):
         wo = torch.tensor(wo)
         w = torch.tensor(w)
 
         assert not torch.allclose(wo, w, atol=1e-2), (
-            "pooler_config softmax is not working"
+            "pooler_config activation is not working"
         )
         assert torch.allclose(softmax(wo), w, atol=1e-2), (
-            "w_softmax should be close to softmax(wo_softmax)."
+            "w_activation should be close to activation(wo_activation)."
+        )
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "intfloat/multilingual-e5-small",
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_multi_vector_retrieval_models_using_normalize(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=False),
+    ) as vllm_model:
+        wo_normalize = vllm_model.token_embed(example_prompts)
+
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=True),
+    ) as vllm_model:
+        w_normalize = vllm_model.token_embed(example_prompts)
+
+    for wo, w in zip(wo_normalize, w_normalize):
+        assert not torch.allclose(wo, w, atol=1e-2), (
+            "pooler_config normalize is not working"
+        )
+        assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
+            "w_normal should be close to normal(wo_normal)."
         )
diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py
index 784d9fc312679..2dfc0072126bc 100644
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -19,7 +19,7 @@ def test_bert_models(
     dtype: str,
 ) -> None:
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)
 
     with hf_runner(
         model, dtype=dtype, auto_cls=AutoModelForTokenClassification
@@ -50,7 +50,7 @@ def test_modernbert_models(
     dtype: str,
 ) -> None:
     with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)
 
     with hf_runner(
         model, dtype=dtype, auto_cls=AutoModelForTokenClassification
diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py
index abf4150a91329..62154b0834878 100644
--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -39,7 +39,7 @@ def _run_test(
         max_num_seqs=32,
         default_torch_num_threads=1,
     ) as vllm_model:
-        vllm_model.encode(prompt)
+        vllm_model.llm.encode(prompt, pooling_task="token_classify")
 
 
 MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index d1dae587d38eb..98245cdf0c984 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                 "embed": Pooler.for_embed(pooler_config),
             }
         )
diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py
index 912b32755e80f..936f27fb69bc6 100644
--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
         out_data_format="b64_json",
     )
 
-    pooling_params = PoolingParams(task="encode", softmax=False)
+    pooling_params = PoolingParams(activation=False)
 
     with vllm_runner(
         model_name,
@@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
         io_processor_plugin="prithvi_to_tiff",
     ) as llm_runner:
         pooler_output = llm_runner.get_llm().encode(
-            img_prompt,
-            pooling_params=pooling_params,
+            img_prompt, pooling_params=pooling_params, pooling_task="token_classify"
         )
     output = pooler_output[0].outputs
 
diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py
index e3561ac3a577e..e73d7efc1483a 100644
--- a/tests/test_pooling_params.py
+++ b/tests/test_pooling_params.py
@@ -1,10 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
 import pytest
 
 from tests.models.utils import EmbedModelInfo
 from vllm import PoolingParams
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, PoolerConfig
 
 EMBEDDING_MODELS = [
     EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
@@ -15,6 +17,15 @@ EMBEDDING_MODELS = [
     ),
 ]
 
+classify_parameters = ["activation"]
+embed_parameters = ["dimensions", "normalize"]
+step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
+
+
+@dataclass()
+class MockModelConfig:
+    pooler_config: PoolerConfig
+
 
 def test_task():
     pooling_params = PoolingParams()
@@ -24,25 +35,27 @@ def test_task():
     pooling_params.verify(task="score")
 
     with pytest.raises(ValueError):
-        pooling_params.verify(task="encode")
+        pooling_params.verify(task="classify")
 
 
 def test_embed():
     task = "embed"
+    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+
     pooling_params = PoolingParams(normalize=None)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
     pooling_params = PoolingParams(normalize=True)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
     pooling_params = PoolingParams(normalize=False)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    invalid_parameters = ["activation", "softmax"]
+    invalid_parameters = classify_parameters + step_pooling_parameters
     for p in invalid_parameters:
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
 
 
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@@ -73,35 +86,71 @@ def test_embed_dimensions(model_info: EmbedModelInfo):
 
 @pytest.mark.parametrize("task", ["score", "classify"])
 def test_classify(task):
+    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+
     pooling_params = PoolingParams(activation=None)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
     pooling_params = PoolingParams(activation=True)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
     pooling_params = PoolingParams(activation=False)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    invalid_parameters = ["dimensions", "normalize", "softmax"]
+    invalid_parameters = embed_parameters + step_pooling_parameters
     for p in invalid_parameters:
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
 
 
-def test_encode():
-    task = "encode"
-    pooling_params = PoolingParams(softmax=None)
-    pooling_params.verify(task=task)
+@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"])
+def test_token_embed(pooling_type: str):
+    task = "token_embed"
+    model_config = MockModelConfig(
+        pooler_config=PoolerConfig(pooling_type=pooling_type)
+    )
 
-    pooling_params = PoolingParams(softmax=True)
-    pooling_params.verify(task=task)
+    pooling_params = PoolingParams(normalize=None)
+    pooling_params.verify(task=task, model_config=model_config)
 
-    pooling_params = PoolingParams(softmax=False)
-    pooling_params.verify(task=task)
+    pooling_params = PoolingParams(normalize=True)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(normalize=False)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    invalid_parameters = classify_parameters
+    if pooling_type != "STEP":
+        invalid_parameters = classify_parameters + step_pooling_parameters
 
-    invalid_parameters = ["dimensions", "normalize", "activation"]
     for p in invalid_parameters:
         with pytest.raises(ValueError):
             pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
+
+
+@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"])
+def test_token_classify(pooling_type: str):
+    task = "token_classify"
+    model_config = MockModelConfig(
+        pooler_config=PoolerConfig(pooling_type=pooling_type)
+    )
+
+    pooling_params = PoolingParams(activation=None)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(activation=True)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(activation=False)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    invalid_parameters = embed_parameters
+    if pooling_type != "STEP":
+        invalid_parameters = embed_parameters + step_pooling_parameters
+
+    for p in invalid_parameters:
+        with pytest.raises(ValueError):
+            pooling_params = PoolingParams(**{p: True})
+            pooling_params.verify(task=task, model_config=model_config)
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 61376736d0f7a..e2db9d049a758 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -951,7 +951,7 @@ class LLM:
         truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
-        pooling_task: PoolingTask = "encode",
+        pooling_task: PoolingTask | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[PoolingRequestOutput]:
         """Apply pooling to the hidden states corresponding to the input
@@ -986,25 +986,24 @@ class LLM:
             instead pass them via the `inputs` parameter.
         """
 
-        if self.supported_tasks == ["encode"] and pooling_task is None:
-            pooling_task = "encode"
+        error_str = (
+            "pooling_task required for `LLM.encode`\n"
+            "Please use one of the more specific methods or set the "
+            "pooling_task when using `LLM.encode`:\n"
+            "  - For embeddings, use `LLM.embed(...)` "
+            'or `pooling_task="embed"`.\n'
+            "  - For classification logits, use `LLM.classify(...)` "
+            'or `pooling_task="classify"`.\n'
+            "  - For similarity scores, use `LLM.score(...)`.\n"
+            "  - For rewards, use `LLM.reward(...)` "
+            'or `pooling_task="token_classify"`\n'
+            "  - For token classification, "
+            'use `pooling_task="token_classify"`\n'
+            '  - For multi-vector retrieval, use `pooling_task="token_embed"`'
+        )
 
         if pooling_task is None:
-            pooling_task = "embed" if "embed" in self.supported_tasks else "encode"
-
-            logger.warning_once(
-                "`LLM.encode` is currently using `pooling_task = %s`.\n"
-                "Please use one of the more specific methods or set the "
-                "task directly when using `LLM.encode`:\n"
-                "  - For embeddings, use `LLM.embed(...)` "
-                'or `pooling_task="embed"`.\n'
-                "  - For classification logits, use `LLM.classify(...)` "
-                'or `pooling_task="classify"`.\n'
-                "  - For rewards, use `LLM.reward(...)` "
-                'or `pooling_task="reward"`\n'
-                "  - For similarity scores, use `LLM.score(...)`.",
-                pooling_task,
-            )
+            raise ValueError(error_str)
 
         model_config = self.model_config
         runner_type = model_config.runner_type
@@ -1206,7 +1205,7 @@ class LLM:
             lora_request=lora_request,
             pooling_params=pooling_params,
             truncate_prompt_tokens=truncate_prompt_tokens,
-            pooling_task="encode",
+            pooling_task="token_classify",
         )
 
     def _embedding_score(
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index fd80ba7a9afca..0ac0355956908 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1748,16 +1748,19 @@ async def init_app_state(
         else None
     )
     state.openai_serving_pooling = (
-        OpenAIServingPooling(
-            engine_client,
-            state.openai_serving_models,
-            request_logger=request_logger,
-            chat_template=resolved_chat_template,
-            chat_template_content_format=args.chat_template_content_format,
-            trust_request_chat_template=args.trust_request_chat_template,
-            log_error_stack=args.log_error_stack,
+        (
+            OpenAIServingPooling(
+                engine_client,
+                state.openai_serving_models,
+                supported_tasks=supported_tasks,
+                request_logger=request_logger,
+                chat_template=resolved_chat_template,
+                chat_template_content_format=args.chat_template_content_format,
+                trust_request_chat_template=args.trust_request_chat_template,
+                log_error_stack=args.log_error_stack,
+            )
         )
-        if "encode" in supported_tasks
+        if ("token_embed" in supported_tasks or "token_classify" in supported_tasks)
         else None
     )
     state.openai_serving_embedding = (
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 86e1e62ff437b..5b8a118280da3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1682,7 +1682,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
     When using plugins IOProcessor plugins, the actual input is processed
     by the plugin itself. Hence, we use a generic type for the request data
     """
-    softmax: bool = True
+    activation: bool = False
 
     embed_dtype: str = Field(
         default="float32",
@@ -1693,7 +1693,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
     )
 
     def to_pooling_params(self):
-        return PoolingParams(task="encode", softmax=self.softmax)
+        return PoolingParams(task="token_classify", activation=self.activation)
 
 
 class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 3ed17abe09464..aa81a233b2979 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -35,6 +35,7 @@ from vllm.entrypoints.renderer import RenderConfig
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
+from vllm.tasks import SupportedTask
 from vllm.utils import merge_async_iterators
 
 logger = init_logger(__name__)
@@ -62,6 +63,7 @@ class OpenAIServingPooling(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
+        supported_tasks: tuple[SupportedTask, ...],
         request_logger: RequestLogger | None,
         chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
@@ -75,6 +77,7 @@ class OpenAIServingPooling(OpenAIServing):
             log_error_stack=log_error_stack,
         )
 
+        self.supported_tasks = supported_tasks
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
         self.trust_request_chat_template = trust_request_chat_template
@@ -178,8 +181,17 @@ class OpenAIServingPooling(OpenAIServing):
         try:
             pooling_params = request.to_pooling_params()
 
+            if "token_embed" in self.supported_tasks:
+                pooling_task = "token_embed"
+            elif "token_classify" in self.supported_tasks:
+                pooling_task = "token_classify"
+            else:
+                return self.create_error_response(
+                    f"pooling_task must be one of {self.supported_tasks}."
+                )
+
             try:
-                pooling_params.verify("encode", self.model_config)
+                pooling_params.verify(pooling_task, self.model_config)
             except ValueError as e:
                 return self.create_error_response(str(e))
 
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 010c607bcabf7..84e176f0ea89f 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -64,66 +64,6 @@ class PoolingParamsUpdate:
         params.requires_token_ids = self.requires_token_ids
 
 
-class Pooler(nn.Module, ABC):
-    """The interface required for all poolers used in pooling models in vLLM."""
-
-    @staticmethod
-    def for_encode(pooler_config: PoolerConfig):
-        if pooler_config.pooling_type == "STEP":
-            return StepPooler()
-
-        resolved_config = ResolvedPoolingConfig(
-            task="encode", pooling_type=PoolingType.ALL
-        )
-
-        return SimplePooler.from_config(resolved_config)
-
-    @staticmethod
-    def for_embed(pooler_config: PoolerConfig):
-        resolved_config = ResolvedPoolingConfig.from_config(
-            task="embed",
-            pooler_config=pooler_config,
-        )
-
-        return SimplePooler.from_config(resolved_config)
-
-    @staticmethod
-    def for_classify(
-        pooler_config: PoolerConfig,
-        classifier: ClassifierFn | None,
-    ):
-        resolved_config = ResolvedPoolingConfig.from_config(
-            task="classify",
-            pooler_config=pooler_config,
-        )
-
-        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
-
-        return ClassifierPooler(
-            pooling=pooling,
-            classifier=classifier,
-        )
-
-    @abstractmethod
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        """Determine which pooling tasks are supported."""
-        raise NotImplementedError
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        """
-        Construct the updated pooling parameters to use for a supported task.
-        """
-        return PoolingParamsUpdate()
-
-    @abstractmethod
-    def forward(
-        self,
-        hidden_states: list[torch.Tensor] | torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        raise NotImplementedError
-
-
 def get_prompt_lens(
     hidden_states: torch.Tensor | list[torch.Tensor],
     pooling_metadata: PoolingMetadata,
@@ -237,7 +177,7 @@ class PoolingMethod(nn.Module, ABC):
 
 class CLSPool(PoolingMethod):
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
 
     def forward_all(
         self,
@@ -253,7 +193,7 @@ class CLSPool(PoolingMethod):
 
 class LastPool(PoolingMethod):
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
 
     def forward_all(
         self,
@@ -265,7 +205,7 @@ class LastPool(PoolingMethod):
 
 class AllPool(PoolingMethod):
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode"}
+        return {"token_embed", "token_classify"}
 
     def forward_all(
         self,
@@ -284,7 +224,7 @@ class AllPool(PoolingMethod):
 
 class MeanPool(PoolingMethod):
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify", "score"}
 
     def forward_all(
         self,
@@ -398,6 +338,82 @@ class LambdaPoolerActivation(PoolerActivation):
         return self.fn(pooled_data)
 
 
+class Pooler(nn.Module, ABC):
+    """The interface required for all poolers used in pooling models in vLLM."""
+
+    @staticmethod
+    def for_token_embed(pooler_config: PoolerConfig):
+        head = TokenEmbeddingPoolerHead()
+
+        if pooler_config.pooling_type == "STEP":
+            return StepPooler(head=head)
+
+        return AllPooler(head=head)
+
+    @staticmethod
+    def for_token_classify(
+        pooler_config: PoolerConfig,
+        classifier: ClassifierFn | None = None,
+        act_fn: PoolerActivation | str | None = None,
+    ):
+        head = TokenClassifierPoolerHead(classifier=classifier, act_fn=act_fn)
+
+        if pooler_config.pooling_type == "STEP":
+            return StepPooler(head=head)
+
+        return AllPooler(head=head)
+
+    @staticmethod
+    def for_embed(pooler_config: PoolerConfig):
+        resolved_config = ResolvedPoolingConfig.from_config(
+            task="embed",
+            pooler_config=pooler_config,
+        )
+
+        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
+        head = EmbeddingPoolerHead()
+
+        return SimplePooler(pooling=pooling, head=head)
+
+    @staticmethod
+    def for_classify(
+        pooler_config: PoolerConfig,
+        classifier: ClassifierFn | None,
+        act_fn: PoolerActivation | str | None = None,
+    ):
+        resolved_config = ResolvedPoolingConfig.from_config(
+            task="classify",
+            pooler_config=pooler_config,
+        )
+
+        pooling = PoolingMethod.from_pooling_type(resolved_config.pooling_type)
+
+        return ClassifierPooler(
+            pooling=pooling,
+            classifier=classifier,
+            act_fn=act_fn,
+        )
+
+    @abstractmethod
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        """Determine which pooling tasks are supported."""
+        raise NotImplementedError
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        """
+        Construct the updated pooling parameters to use for a supported task.
+        """
+        return PoolingParamsUpdate()
+
+    @abstractmethod
+    def forward(
+        self,
+        hidden_states: list[torch.Tensor] | torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        raise NotImplementedError
+
+
 class PoolerHead(nn.Module):
     def __init__(self, activation: PoolerActivation) -> None:
         super().__init__()
@@ -416,7 +432,6 @@ class EmbeddingPoolerHead(PoolerHead):
         super().__init__(activation=PoolerNormalize())
 
         # Load ST projector if available
-
         vllm_config = get_current_vllm_config()
         self.projector: nn.Module | None = (
             _load_st_projector(vllm_config.model_config) if vllm_config else None
@@ -471,39 +486,6 @@ class EmbeddingPoolerHead(PoolerHead):
         return pooled_data
 
 
-class RewardPoolerHead(PoolerHead):
-    def __init__(self) -> None:
-        super().__init__(activation=PoolerClassify(static_num_labels=False))
-
-        vllm_config = get_current_vllm_config()
-        self.head_dtype = vllm_config.model_config.head_dtype
-
-    def forward(
-        self,
-        pooled_data: list[torch.Tensor] | torch.Tensor,
-        pooling_metadata: PoolingMetadata,
-    ):
-        if isinstance(pooled_data, list):
-            pooled_data = [p.to(self.head_dtype) for p in pooled_data]
-        else:
-            pooled_data = pooled_data.to(self.head_dtype)
-
-        pooling_params = get_pooling_params(pooling_metadata)
-
-        # for softmax
-        flags = [p.softmax for p in pooling_params]
-        if len(set(flags)) == 1:
-            if flags[0]:
-                pooled_data = self.activation(pooled_data)
-        else:
-            pooled_data = [
-                self.activation(vecs) if f else vecs
-                for vecs, f in zip(pooled_data, flags)
-            ]
-
-        return pooled_data
-
-
 class SimplePooler(Pooler):
     """A layer that pools specific information from hidden states.
 
@@ -513,20 +495,6 @@ class SimplePooler(Pooler):
     3. Returns structured results as `PoolerOutput`.
     """
 
-    @classmethod
-    def from_config(
-        cls,
-        pooler_config: ResolvedPoolingConfig,
-    ) -> "SimplePooler":
-        pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type)
-        if pooler_config.task == "embed":
-            head = EmbeddingPoolerHead()
-        elif pooler_config.task == "encode":
-            head = RewardPoolerHead()
-        else:
-            raise NotImplementedError(f"Unknown task: {pooler_config.task}")
-        return cls(pooling, head)
-
     def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None:
         super().__init__()
 
@@ -549,58 +517,6 @@ class SimplePooler(Pooler):
         return pooled_data
 
 
-class StepPooler(Pooler):
-    def __init__(
-        self,
-    ) -> None:
-        super().__init__()
-
-        self.pooling = AllPool()
-        self.head = RewardPoolerHead()
-
-    def extract_states(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> list[torch.Tensor] | torch.Tensor:
-        pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
-        prompt_token_ids = get_prompt_token_ids(pooling_metadata)
-
-        pooled_data = list[torch.Tensor]()
-
-        pooling_params = get_pooling_params(pooling_metadata)
-
-        for data, token_id, pooling_param in zip(
-            pooled_data_lst, prompt_token_ids, pooling_params
-        ):
-            step_tag_id = pooling_param.step_tag_id
-            returned_token_ids = pooling_param.returned_token_ids
-
-            if returned_token_ids is not None and len(returned_token_ids) > 0:
-                data = data[:, returned_token_ids]
-
-            if step_tag_id is not None:
-                data = data[token_id == step_tag_id]
-            pooled_data.append(data)
-
-        return pooled_data
-
-    def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"encode"}
-
-    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
-        return PoolingParamsUpdate(requires_token_ids=True)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor | list[torch.Tensor],
-        pooling_metadata: PoolingMetadata,
-    ) -> PoolerOutput:
-        pooled_data = self.extract_states(hidden_states, pooling_metadata)
-        pooled_data = self.head(pooled_data, pooling_metadata)
-        return pooled_data
-
-
 class ClassifierPooler(Pooler):
     """A pooling layer for classification tasks.
 
@@ -611,26 +527,46 @@ class ClassifierPooler(Pooler):
     """
 
     @staticmethod
-    def act_fn_for_seq_cls(config: ModelConfig):
-        return get_classification_activation_function(config.hf_config)
+    def act_fn_for_seq_cls(model_config: ModelConfig):
+        return get_classification_activation_function(model_config.hf_config)
 
     @staticmethod
-    def act_fn_for_cross_encoder(config: ModelConfig):
-        return get_cross_encoder_activation_function(config.hf_config)
+    def act_fn_for_cross_encoder(model_config: ModelConfig):
+        return get_cross_encoder_activation_function(model_config.hf_config)
+
+    @staticmethod
+    def resolve_act_fn(
+        model_config: ModelConfig,
+        static_num_labels: bool = True,
+        act_fn: PoolerActivation | str | None = None,
+    ):
+        if isinstance(act_fn, str):
+            if act_fn == "classify":
+                return ClassifierPooler.act_fn_for_seq_cls(model_config)
+            elif act_fn == "score":
+                return ClassifierPooler.act_fn_for_cross_encoder(model_config)
+            else:
+                raise ValueError(f"act_fn [{act_fn=}] not supported.")
+        elif act_fn is None:
+            return PoolerClassify(static_num_labels=static_num_labels)
+        else:
+            assert callable(act_fn)
+            return act_fn
 
     def __init__(
         self,
         pooling: PoolingFn,
         classifier: ClassifierFn | None,
-        act_fn: PoolerActivation | None = None,
+        act_fn: PoolerActivation | str | None = None,
     ) -> None:
         super().__init__()
 
         vllm_config = get_current_vllm_config()
-
         self.pooling = pooling
         self.classifier = classifier
-        self.act_fn = act_fn or PoolerClassify()
+        self.act_fn = self.resolve_act_fn(
+            vllm_config.model_config, static_num_labels=True, act_fn=act_fn
+        )
         self.logit_bias: float | None = (
             vllm_config.model_config.pooler_config.logit_bias
         )
@@ -672,6 +608,150 @@ class ClassifierPooler(Pooler):
         return scores
 
 
+class TokenEmbeddingPoolerHead(EmbeddingPoolerHead):
+    def forward(
+        self, pooled_data: torch.Tensor, pooling_param: PoolingParams
+    ) -> torch.Tensor:
+        pooled_data = pooled_data.to(self.head_dtype)
+        # pooled_data shape: [n_tokens, hidden_dimension]
+
+        # Apply the optional ST projector (skipped when none was loaded)
+        if self.projector is not None:
+            pooled_data = self.projector(pooled_data)
+        # pooled_data shape: [n_tokens, embedding_dimension]
+
+        # Matryoshka: keep the first `dimensions` features (None keeps all)
+        pooled_data = pooled_data[..., : pooling_param.dimensions]
+
+        # Apply the head's activation only when `normalize` is truthy
+        if pooling_param.normalize:
+            pooled_data = self.activation(pooled_data)
+
+        # pooled_data shape: [n_tokens, embedding_dimension]
+        return pooled_data
+
+
+class TokenClassifierPoolerHead(nn.Module):
+    def __init__(
+        self,
+        classifier: ClassifierFn | None,
+        act_fn: PoolerActivation | str | None = None,
+    ) -> None:
+        super().__init__()
+        vllm_config = get_current_vllm_config()
+
+        self.classifier = classifier
+        self.act_fn = ClassifierPooler.resolve_act_fn(
+            vllm_config.model_config, static_num_labels=False, act_fn=act_fn
+        )
+        self.logit_bias: float | None = (
+            vllm_config.model_config.pooler_config.logit_bias
+        )
+        self.head_dtype = vllm_config.model_config.head_dtype
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_classify"}
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_param: PoolingParams,
+    ) -> torch.Tensor:
+        """Compute per-token scores without mutating the input tensor."""
+        hidden_states = hidden_states.to(self.head_dtype)
+        # hidden_states shape: [n_token, hidden_size]
+        if self.classifier is not None:
+            scores = self.classifier(hidden_states)
+        else:
+            scores = hidden_states
+        # scores shape: [n_token, num_labels]
+
+        if self.logit_bias is not None:
+            # Out-of-place: `scores` may alias the caller's hidden states.
+            scores = scores - self.logit_bias
+        if pooling_param.activation:
+            scores = self.act_fn(scores)
+
+        # scores shape: [n_token, num_labels]
+        return scores
+
+
+class AllPooler(Pooler):
+    def __init__(self, head: nn.Module | PoolerHead) -> None:
+        super().__init__()
+
+        self.pooling = AllPool()
+        self.head = head
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed", "token_classify"}
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        pooled_data = self.pooling(hidden_states, pooling_metadata)
+        pooling_params = get_pooling_params(pooling_metadata)
+        assert len(pooled_data) == len(pooling_params)
+
+        pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
+        return pooled_data
+
+
+class StepPooler(Pooler):
+    def __init__(self, head: nn.Module | PoolerHead) -> None:
+        super().__init__()
+
+        self.pooling = AllPool()
+        self.head = head
+
+    def extract_states(
+        self,
+        hidden_states: torch.Tensor | list[torch.Tensor],
+        pooling_metadata: PoolingMetadata,
+    ) -> torch.Tensor | list[torch.Tensor]:
+        pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
+        prompt_token_ids = get_prompt_token_ids(pooling_metadata)
+
+        pooled_data = list[torch.Tensor]()
+
+        pooling_params = get_pooling_params(pooling_metadata)
+
+        for data, token_id, pooling_param in zip(
+            pooled_data_lst, prompt_token_ids, pooling_params
+        ):
+            step_tag_id = pooling_param.step_tag_id
+            returned_token_ids = pooling_param.returned_token_ids
+
+            if returned_token_ids is not None and len(returned_token_ids) > 0:
+                data = data[:, returned_token_ids]  # keep selected columns
+
+            if step_tag_id is not None:
+                data = data[token_id == step_tag_id]  # rows at step-tag tokens
+            pooled_data.append(data)
+
+        return pooled_data
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"token_embed", "token_classify"}
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate(requires_token_ids=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor | list[torch.Tensor],
+        pooling_metadata: PoolingMetadata,
+    ) -> PoolerOutput:
+        pooled_data = self.extract_states(hidden_states, pooling_metadata)
+        pooling_params = get_pooling_params(pooling_metadata)
+        assert len(pooled_data) == len(pooling_params)
+
+        pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
+        return pooled_data
+
+
 class DispatchPooler(Pooler):
     """Dispatches calls to a sub-pooler based on the pooling task."""
 
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 1d3874b164845..5d51cd3757414 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -250,7 +250,7 @@ def as_embedding_model(cls: _T) -> _T:
 
             self.pooler = DispatchPooler(
                 {
-                    "encode": Pooler.for_encode(pooler_config),
+                    "token_embed": Pooler.for_token_embed(pooler_config),
                     "embed": Pooler.for_embed(pooler_config),
                 },
             )
@@ -279,11 +279,8 @@ def as_seq_cls_model(cls: _T) -> _T:
     # Lazy import
     from vllm.model_executor.layers.linear import ReplicatedLinear
     from vllm.model_executor.layers.pooler import (
-        ClassifierPooler,
         DispatchPooler,
         Pooler,
-        PoolingMethod,
-        PoolingType,
     )
     from vllm.model_executor.models.interfaces import SupportsCrossEncoding
     from vllm.sequence import IntermediateTensors
@@ -302,42 +299,29 @@ def as_seq_cls_model(cls: _T) -> _T:
                 model_config.hidden_size,
                 config.num_labels,
                 bias=False,
-                params_dtype=torch.float32,
+                params_dtype=vllm_config.model_config.head_dtype,
                 quant_config=quant_config,
+                return_bias=False,
                 prefix=maybe_prefix(prefix, "score"),
             )
 
             pooler_config = vllm_config.model_config.pooler_config
             assert pooler_config is not None
 
-            pooling_type_str = pooler_config.pooling_type
-            assert pooling_type_str is not None
-            pooling_type = PoolingType[pooling_type_str]
-
             self.pooler = DispatchPooler(
                 {
-                    "encode": Pooler.for_encode(pooler_config),
-                    "classify": ClassifierPooler(
-                        pooling=PoolingMethod.from_pooling_type(pooling_type),
-                        classifier=self._classifier,
-                        act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                            vllm_config.model_config
-                        ),
+                    "token_classify": Pooler.for_token_classify(
+                        pooler_config, classifier=self.score
                     ),
-                    "score": ClassifierPooler(
-                        pooling=PoolingMethod.from_pooling_type(pooling_type),
-                        classifier=self._classifier,
-                        act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                            vllm_config.model_config
-                        ),
+                    "classify": Pooler.for_classify(
+                        pooler_config, classifier=self.score, act_fn="classify"
+                    ),
+                    "score": Pooler.for_classify(
+                        pooler_config, classifier=self.score, act_fn="score"
                     ),
                 }
             )
 
-        def _classifier(self, x: torch.Tensor):
-            x, _ = self.score(x.float())
-            return x
-
         def forward(
             self,
             input_ids: torch.Tensor,
@@ -393,7 +377,11 @@ def as_reward_model(cls: _T) -> _T:
             assert pooler_config is not None
 
             self.pooler = DispatchPooler(
-                {"encode": Pooler.for_encode(pooler_config)},
+                {
+                    "token_classify": Pooler.for_token_classify(
+                        pooler_config=pooler_config
+                    )
+                }
             )
 
     ModelForReward.__name__ = _get_pooling_model_name(cls.__name__, "ForReward")
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 6e81eb8dc91b3..1c2334a785437 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -521,7 +521,7 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
     def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
         return DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                 "embed": Pooler.for_embed(pooler_config),
             }
         )
@@ -724,7 +724,7 @@ class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
 
         return DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                 "embed": SPLADESparsePooler(
                     mlm_head=self.mlm_head,
                     cls_token_id=cls_id,
@@ -821,20 +821,16 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
                 "classify": ClassifierPooler(
                     pooling=self.bert.pooler,
                     classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                        vllm_config.model_config
-                    ),
+                    act_fn="classify",
                 ),
                 "score": ClassifierPooler(
-                    pooling=self.bert.pooler,
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                        vllm_config.model_config
-                    ),
+                    pooling=self.bert.pooler, classifier=self.classifier, act_fn="score"
                 ),
             }
         )
@@ -891,7 +887,9 @@ class BertForTokenClassification(nn.Module):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config=pooler_config
+                ),
             }
         )
 
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 49111dd9ffab5..31fdc4d21245a 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -695,20 +695,16 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
                 "classify": ClassifierPooler(
                     pooling=self.new.pooler,
                     classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                        vllm_config.model_config
-                    ),
+                    act_fn="classify",
                 ),
                 "score": ClassifierPooler(
-                    pooling=self.new.pooler,
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                        vllm_config.model_config
-                    ),
+                    pooling=self.new.pooler, classifier=self.classifier, act_fn="score"
                 ),
             }
         )
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 3d7b28af8bdbe..27953c27188d9 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -837,7 +837,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                 "embed": Pooler.for_embed(pooler_config),
             }
         )
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index ddd6e53b4a436..6d99d02a32be2 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -353,8 +353,15 @@ class GPT2ForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
-                "classify": Pooler.for_classify(pooler_config, classifier=self.score),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.score
+                ),
+                "classify": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="classify"
+                ),
+                "score": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="score"
+                ),
             }
         )
 
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index ede3e34881b17..181c4ed2dca5a 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -239,7 +239,7 @@ class GritLM(LlamaForCausalLM):
         if pooler_config is not None:
             self.pooler = DispatchPooler(
                 {
-                    "encode": Pooler.for_encode(pooler_config),
+                    "token_embed": Pooler.for_token_embed(pooler_config),
                     "embed": GritLMPooler(vllm_config.model_config),
                 }
             )
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 8d83a1478dff9..c5bbd5497a146 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -444,7 +444,7 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
         assert pooler_config is not None
 
         self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)},
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
         )
 
     def forward(
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 49cb9311a786d..f8a87cf6965f8 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -604,10 +604,14 @@ class JambaForSequenceClassification(JambaForCausalLM):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.score
+                ),
                 "classify": Pooler.for_classify(
-                    pooler_config,
-                    classifier=self.score,
+                    pooler_config, classifier=self.score, act_fn="classify"
+                ),
+                "score": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="score"
                 ),
             }
         )
diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py
index a9333155243d4..05a40837954d8 100644
--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -97,9 +97,15 @@ class JinaVLForSequenceClassification(
         self.score = JinaVLScorer(vllm_config.model_config)
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
-                "classify": Pooler.for_classify(pooler_config, classifier=self.score),
-                "score": Pooler.for_classify(pooler_config, classifier=self.score),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.score
+                ),
+                "classify": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="classify"
+                ),
+                "score": Pooler.for_classify(
+                    pooler_config, classifier=self.score, act_fn="score"
+                ),
             }
         )
 
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index ff9f6a41ab994..5dbf38c69086f 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -322,20 +322,14 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
                 "classify": ClassifierPooler(
-                    pooling=self.pooling,
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                        vllm_config.model_config
-                    ),
+                    pooling=self.pooling, classifier=self.classifier, act_fn="classify"
                 ),
                 "score": ClassifierPooler(
-                    pooling=self.pooling,
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                        vllm_config.model_config
-                    ),
+                    pooling=self.pooling, classifier=self.classifier, act_fn="score"
                 ),
             }
         )
@@ -421,7 +415,9 @@ class ModernBertForTokenClassification(nn.Module):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config=pooler_config
+                ),
             }
         )
 
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index c2f2ba637f090..e2ba0e262cf79 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -107,7 +107,7 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel):
         assert pooler_config is not None
 
         self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)},
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
         )
 
 
@@ -120,4 +120,6 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
         pooler_config = vllm_config.model_config.pooler_config
         assert pooler_config is not None
 
-        self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)})
+        self.pooler = DispatchPooler(
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
+        )
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 456226360b91b..cfccb904f46c9 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -105,15 +105,7 @@ class RobertaClassificationHead(nn.Module):
 
 @default_pooling_type("CLS")
 class RobertaEmbeddingModel(BertEmbeddingModel):
-    """A model that uses Roberta to provide embedding functionalities.
-
-    This class encapsulates the BertModel and provides an interface for
-    embedding operations and customized pooling functions.
-
-    Attributes:
-        model: An instance of BertModel used for forward operations.
-        _pooler: An instance of Pooler used for pooling operations.
-    """
+    """A model that uses Roberta to provide embedding functionalities."""
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__(vllm_config=vllm_config, prefix=prefix)
@@ -212,20 +204,14 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config=pooler_config, classifier=self.classifier
+                ),
                 "classify": ClassifierPooler(
-                    pooling=CLSPool(),
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                        vllm_config.model_config
-                    ),
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="classify"
                 ),
                 "score": ClassifierPooler(
-                    pooling=CLSPool(),
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                        vllm_config.model_config
-                    ),
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="score"
                 ),
             }
         )
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index e8506666db5bc..0252705c62b13 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -250,7 +250,7 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
         assert pooler_config is not None
 
         self.pooler = DispatchPooler(
-            {"encode": Pooler.for_encode(pooler_config)},
+            {"token_classify": Pooler.for_token_classify(pooler_config)}
         )
 
     def get_input_embeddings(
diff --git a/vllm/model_executor/models/transformers_pooling.py b/vllm/model_executor/models/transformers_pooling.py
index 411fb92e9460b..7ddeb403da448 100644
--- a/vllm/model_executor/models/transformers_pooling.py
+++ b/vllm/model_executor/models/transformers_pooling.py
@@ -135,7 +135,7 @@ class TransformersEmbeddingModel(TransformersPoolingBase):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                 "embed": Pooler.for_embed(pooler_config),
             }
         )
@@ -190,20 +190,14 @@ class TransformersForSequenceClassification(TransformersPoolingBase):
 
         self.pooler = DispatchPooler(
             {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_classify": Pooler.for_token_classify(
+                    pooler_config, classifier=self.classifier
+                ),
                 "classify": ClassifierPooler(
-                    pooling=CLSPool(),
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_seq_cls(
-                        vllm_config.model_config
-                    ),
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="classify"
                 ),
                 "score": ClassifierPooler(
-                    pooling=CLSPool(),
-                    classifier=self.classifier,
-                    act_fn=ClassifierPooler.act_fn_for_cross_encoder(
-                        vllm_config.model_config
-                    ),
+                    pooling=CLSPool(), classifier=self.classifier, act_fn="score"
                 ),
             }
         )
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 175a4ac01b83e..c6dff6e01c1d6 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -10,7 +10,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.tasks import PoolingTask
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import ModelConfig, PoolerConfig
 
 
 class PoolingParams(
@@ -30,7 +30,6 @@ class PoolingParams(
             if model support matryoshka representation.
         activation: Whether to apply activation function to
             the classification outputs.
-        softmax: Whether to apply softmax to the reward outputs.
     """
 
     # --8<-- [start:common-pooling-params]
@@ -48,32 +47,19 @@ class PoolingParams(
     activation: bool | None = None
     # --8<-- [end:classification-pooling-params]
 
-    ## for reward models
-    softmax: bool | None = None
+    ## for step pooling models
     step_tag_id: int | None = None
     returned_token_ids: list[int] | None = None
 
+    ## Internal use only
     task: PoolingTask | None = None
-    """Internal use only."""
-
     requires_token_ids: bool = False
-    """Internal use only."""
-
     extra_kwargs: dict[str, Any] | None = None
-    """Internal use only."""
-
     output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
 
     @property
     def all_parameters(self) -> list[str]:
-        return [
-            "dimensions",
-            "normalize",
-            "activation",
-            "softmax",
-            "step_tag_id",
-            "returned_token_ids",
-        ]
+        return ["dimensions", "normalize", "activation"]
 
     @property
     def valid_parameters(self):
@@ -81,7 +67,8 @@ class PoolingParams(
             "embed": ["dimensions", "normalize"],
             "classify": ["activation"],
             "score": ["activation"],
-            "encode": ["softmax", "step_tag_id", "returned_token_ids"],
+            "token_embed": ["dimensions", "normalize"],
+            "token_classify": ["activation"],
         }
 
     def clone(self) -> "PoolingParams":
@@ -100,7 +87,6 @@ class PoolingParams(
         # NOTE: Task validation needs to done against the model instance,
         # which is not available in model config. So, it's not included
         # in this method
-
         self._merge_default_parameters(model_config)
         self._set_default_parameters(model_config)
         self._verify_valid_parameters()
@@ -125,8 +111,34 @@ class PoolingParams(
             if getattr(self, k, None) is None:
                 setattr(self, k, getattr(pooler_config, k))
 
+        self._verify_step_pooling(pooler_config, valid_parameters)
+
+    def _verify_step_pooling(
+        self, pooler_config: "PoolerConfig", valid_parameters: list[str]
+    ):
+        step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
+        if pooler_config.pooling_type != "STEP":
+            invalid_parameters = []
+            for k in step_pooling_parameters:
+                if getattr(self, k, None) is not None:
+                    invalid_parameters.append(k)
+
+            if invalid_parameters:
+                raise ValueError(
+                    f"Task {self.task} only supports {valid_parameters} "
+                    f"parameters, does not support "
+                    f"{invalid_parameters} parameters"
+                )
+        else:
+            for k in step_pooling_parameters:
+                if getattr(pooler_config, k, None) is None:
+                    continue
+
+                if getattr(self, k, None) is None:
+                    setattr(self, k, getattr(pooler_config, k))
+
     def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
-        if self.task == "embed":
+        if self.task in ["embed", "token_embed"]:
             if self.normalize is None:
                 self.normalize = True
 
@@ -150,13 +162,9 @@ class PoolingParams(
                 elif self.dimensions < 1:
                     raise ValueError("Dimensions must be greater than 0")
 
-        elif self.task in ["classify", "score"]:
+        elif self.task in ["classify", "score", "token_classify"]:
             if self.activation is None:
                 self.activation = True
-
-        elif self.task == "encode":
-            if self.softmax is None:
-                self.softmax = True
         else:
             raise ValueError(f"Unknown pooling task: {self.task}")
 
@@ -185,7 +193,6 @@ class PoolingParams(
             f"normalize={self.normalize}, "
             f"dimensions={self.dimensions}, "
             f"activation={self.activation}, "
-            f"softmax={self.softmax}, "
             f"step_tag_id={self.step_tag_id}, "
             f"returned_token_ids={self.returned_token_ids}, "
             f"requires_token_ids={self.requires_token_ids}, "
diff --git a/vllm/tasks.py b/vllm/tasks.py
index 85c5c6e436205..6551444d17109 100644
--- a/vllm/tasks.py
+++ b/vllm/tasks.py
@@ -5,7 +5,7 @@ from typing import Literal, get_args
 GenerationTask = Literal["generate", "transcription"]
 GENERATION_TASKS = get_args(GenerationTask)
 
-PoolingTask = Literal["encode", "embed", "classify", "score"]
+PoolingTask = Literal["embed", "classify", "score", "token_embed", "token_classify"]
 POOLING_TASKS = get_args(PoolingTask)
 
 SupportedTask = Literal[GenerationTask, PoolingTask]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d995a609318cd..9e394dbb592ec 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1926,15 +1926,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         supported_tasks = list(model.pooler.get_supported_tasks())
 
-        if (
-            self.scheduler_config.chunked_prefill_enabled
-            and "encode" in supported_tasks
-        ):
-            supported_tasks.remove("encode")
+        if self.scheduler_config.chunked_prefill_enabled:
+            if "token_embed" in supported_tasks:
+                supported_tasks.remove("token_embed")
+            if "token_classify" in supported_tasks:
+                supported_tasks.remove("token_classify")
 
             logger.debug_once(
                 "Chunked prefill is not supported with "
-                "encode task which using ALL pooling. "
+                "token_embed and token_classify tasks "
+                "which using ALL pooling. "
                 "Please turn off chunked prefill by "
                 "`--no-enable-chunked-prefill` before using it."
             )

From f93e348010d202297825efb25fa4138913bdfea4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 15 Oct 2025 20:09:03 +0800
Subject: [PATCH 35/51] [Misc] Remove `isort` and `yapf` ignores (#26888)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/v1/engine/conftest.py                                  | 3 +--
 tests/v1/tpu/test_topk_topp_sampler.py                       | 3 ---
 .../quantization/compressed_tensors/schemes/__init__.py      | 1 +
 vllm/model_executor/models/qwen3_omni_moe_thinker.py         | 5 -----
 4 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py
index c5c5d35b83c3e..283a76dab6723 100644
--- a/tests/v1/engine/conftest.py
+++ b/tests/v1/engine/conftest.py
@@ -6,6 +6,7 @@ import torch
 from transformers import AutoTokenizer
 
 from tests.v1.engine.utils import (
+    FULL_STRINGS,
     NUM_PROMPT_LOGPROBS_UNDER_TEST,
     NUM_SAMPLE_LOGPROBS_UNDER_TEST,
     PROMPT_LEN,
@@ -18,8 +19,6 @@ from vllm.engine.arg_utils import EngineArgs
 
 from ...distributed.conftest import publisher_config, random_port  # noqa: F401
 
-from tests.v1.engine.utils import FULL_STRINGS  # isort: skip
-
 EngineCoreSampleLogprobsType = list[tuple[torch.Tensor, torch.Tensor]]
 EngineCorePromptLogprobsType = tuple[torch.Tensor, torch.Tensor]
 
diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py
index c2fc24442c7cd..c6634395bb167 100644
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
@@ -8,10 +8,7 @@ import torch_xla
 
 from vllm.platforms import current_platform
 from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
-
-# isort: off
 from vllm.v1.sample.tpu.sampler import apply_top_k_top_p as apply_top_k_top_p_tpu
-# isort: on
 
 if not current_platform.is_tpu():
     pytest.skip("This test needs a TPU.", allow_module_level=True)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index fc0634394ece3..ca286675ebd0c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -15,6 +15,7 @@ from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
 from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
 from .compressed_tensors_wNa16 import WNA16_SUPPORTED_BITS, CompressedTensorsWNA16
 
+# Imported last (outside isort's ordering) to avoid a circular import error
 from .compressed_tensors_24 import CompressedTensors24  # isort: skip
 
 __all__ = [
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index d5a75e75aa43e..08bccee9e2d1a 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -80,17 +80,12 @@ from .interfaces import (
     SupportsMultiModal,
     SupportsPP,
 )
-
-# yapf conflicts with isort for this block
-# yapf: disable
 from .qwen2_5_omni_thinker import (
     Qwen2_5OmniConditionalGenerationMixin,
     Qwen2_5OmniThinkerDummyInputsBuilder,
     Qwen2_5OmniThinkerMultiModalProcessor,
     Qwen2_5OmniThinkerProcessingInfo,
 )
-
-# yapf: enable
 from .qwen2_5_vl import (
     Qwen2_5_VisionAttention,
     Qwen2_5_VisionRotaryEmbedding,

From 8f4b313c3790844d2d6ec9aeaa6dd0825c94752e Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Wed, 15 Oct 2025 20:11:48 +0800
Subject: [PATCH 36/51] [Misc] rename torch_dtype to dtype (#26695)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 benchmarks/kernels/benchmark_moe.py                |  2 +-
 .../kernels/benchmark_moe_permute_unpermute.py     |  2 +-
 docs/features/quantization/auto_round.md           |  2 +-
 docs/features/quantization/fp8.md                  |  2 +-
 docs/features/quantization/int4.md                 |  2 +-
 docs/features/quantization/int8.md                 |  2 +-
 docs/features/quantization/quantized_kvcache.md    |  2 +-
 docs/features/quantization/quark.md                |  2 +-
 docs/features/quantization/torchao.md              |  2 +-
 requirements/common.txt                            |  2 +-
 tests/conftest.py                                  |  8 ++++----
 tests/models/multimodal/pooling/test_intern_vit.py |  2 +-
 tests/models/multimodal/pooling/test_radio.py      |  2 +-
 vllm/benchmarks/throughput.py                      |  2 +-
 vllm/config/model.py                               | 14 +++++++-------
 vllm/entrypoints/llm.py                            |  5 ++---
 vllm/model_executor/model_loader/tensorizer.py     |  2 +-
 vllm/model_executor/models/chameleon.py            |  2 +-
 vllm/model_executor/models/ernie45_vl.py           |  2 +-
 vllm/model_executor/models/glm4v.py                |  2 +-
 vllm/model_executor/models/longcat_flash.py        |  4 ++--
 vllm/model_executor/models/nano_nemotron_vl.py     |  4 ++--
 vllm/model_executor/models/qwen3_next.py           |  6 +++---
 vllm/model_executor/models/transformers.py         |  2 +-
 vllm/model_executor/models/transformers_pooling.py |  2 +-
 vllm/platforms/cuda.py                             |  4 ++--
 vllm/platforms/interface.py                        |  2 +-
 vllm/platforms/rocm.py                             |  4 ++--
 vllm/platforms/xpu.py                              |  4 ++--
 vllm/utils/__init__.py                             | 14 ++++++--------
 30 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index d3040e9738f7b..9298d3b58dfb9 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 04d2205aa3722..459eafa6d907d 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
 
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md
index ac766d5e29228..9c14f362b663f 100644
--- a/docs/features/quantization/auto_round.md
+++ b/docs/features/quantization/auto_round.md
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound
 
 model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 bits, group_size, sym = 4, 128, True
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index a54acdbb96223..0c5111fb8af0d 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index 5d8e06ffb5d77..035e7ea291f9e 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index ee1de21460573..ec8a77f74ffef 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index e0585a88451d4..56cf057678be6 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
 
     # Select model and load it
     MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
     # Select calibration dataset
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index f0cd20b7335c2..385e3bbb8712f 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -50,7 +50,7 @@ to fetch model and tokenizer.
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         device_map="auto",
-        torch_dtype="auto",
+        dtype="auto",
     )
     model.eval()
 
diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md
index 6932445997012..b95b560882bb1 100644
--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
     quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
     quantized_model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype="auto",
+        dtype="auto",
         device_map="auto",
         quantization_config=quantization_config
     )
diff --git a/requirements/common.txt b/requirements/common.txt
index ec668e16d0e97..5e7769561c4f4 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.55.2
+transformers >= 4.56.0
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
diff --git a/tests/conftest.py b/tests/conftest.py
index 9126b3d668b9c..369acb92cfb91 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -334,7 +334,7 @@ class HfRunner:
             trust_remote_code=trust_remote_code,
         )
         self.device = self.get_default_device()
-        self.dtype = torch_dtype = _get_and_verify_dtype(
+        self.dtype = dtype = _get_and_verify_dtype(
             self.model_name,
             self.config,
             dtype=dtype,
@@ -342,7 +342,7 @@ class HfRunner:
         )
 
         model_kwargs = model_kwargs if model_kwargs is not None else {}
-        model_kwargs.setdefault("torch_dtype", torch_dtype)
+        model_kwargs.setdefault("dtype", dtype)
 
         if is_sentence_transformer:
             # Lazy init required for AMD CI
@@ -388,7 +388,7 @@ class HfRunner:
         if not skip_tokenizer_init:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
-                torch_dtype=torch_dtype,
+                dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
 
@@ -398,7 +398,7 @@ class HfRunner:
 
         self.processor = AutoProcessor.from_pretrained(
             model_name,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
         if skip_tokenizer_init:
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index b474e851319ae..74e30c4307fac 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -38,7 +38,7 @@ def run_intern_vit_test(
         config.norm_type = "rms_norm"
 
     hf_model = AutoModel.from_pretrained(
-        model, torch_dtype=torch_dtype, trust_remote_code=True
+        model, dtype=torch_dtype, trust_remote_code=True
     ).to("cuda")
     hf_outputs_per_image = [
         hf_model(pixel_value.to("cuda")).last_hidden_state
diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py
index 80f594021ca8a..414e99a71e7b0 100644
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -45,7 +45,7 @@ def run_radio_test(
     hf_model = AutoModel.from_pretrained(
         model_id,
         config=config,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True,
     ).to("cuda")
     hf_model.eval()
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 01c6824ac91f8..ad111a1ebd5be 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -251,7 +251,7 @@ def run_hf(
     disable_detokenize: bool = False,
 ) -> float:
     llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+        model, dtype=torch.float16, trust_remote_code=trust_remote_code
     )
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 2be939eb654d8..6e5757ba037d5 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1837,18 +1837,18 @@ def _find_dtype(
     *,
     revision: str | None,
 ):
-    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
-    # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    # NOTE: getattr(config, "dtype", torch.float32) is not correct
+    # because config.dtype can be None.
+    config_dtype = getattr(config, "dtype", None)
 
     # Fallbacks for multi-modal models if the root config
-    # does not define torch_dtype
+    # does not define dtype
     if config_dtype is None:
-        config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
+        config_dtype = getattr(config.get_text_config(), "dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
-        config_dtype = getattr(config.vision_config, "torch_dtype", None)
+        config_dtype = getattr(config.vision_config, "dtype", None)
     if config_dtype is None and hasattr(config, "encoder_config"):
-        config_dtype = getattr(config.encoder_config, "torch_dtype", None)
+        config_dtype = getattr(config.encoder_config, "dtype", None)
 
     # Try to read the dtype of the weights if they are in safetensors format
     if config_dtype is None:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e2db9d049a758..5883b92acd994 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -117,9 +117,8 @@ class LLM:
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
             we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
-            the `torch_dtype` attribute specified in the model config file.
-            However, if the `torch_dtype` in the config is `float32`, we will
-            use `float16` instead.
+            the `dtype` attribute of the Transformers model's config. However,
+            if the `dtype` in the config is `float32`, we will use `float16` instead.
         quantization: The method used to quantize the model weights. Currently,
             we support "awq", "gptq", and "fp8" (experimental).
             If None, we first check the `quantization_config` attribute in the
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index ce5c0506979a2..4ebfba65ac805 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -518,7 +518,7 @@ def init_tensorizer_model(
 ) -> nn.Module:
     assert tensorizer_config.hf_config is not None
     model_args = tensorizer_config.hf_config
-    model_args.torch_dtype = tensorizer_config.dtype
+    model_args.dtype = tensorizer_config.dtype
     assert tensorizer_config.model_class is not None
     # TODO: Do we need to consider old-style model class?
     with meta_tensor_mode(), set_current_vllm_config(vllm_config, check_compile=True):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 2ca761dd2b550..6f7e18d78bada 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -999,7 +999,7 @@ class ChameleonForConditionalGeneration(
             return []
         assert self.model.vqmodel is not None
         image_tokens = self.model.get_image_tokens(
-            image_input["data"].to(self.config.torch_dtype)
+            image_input["data"].to(self.config.dtype)
         )
         vision_embeddings = self.model.get_input_embeddings(image_tokens)
         return vision_embeddings
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index f40bd01deccd5..e5badc0a28f65 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1089,7 +1089,7 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
         pixel_values = (
             rescale_factor * pixel_values.to(torch.float32) - image_mean_tensor
         ) / image_std_tensor
-        pixel_values = pixel_values.to(hf_config.torch_dtype)
+        pixel_values = pixel_values.to(hf_config.dtype)
         return pixel_values
 
     def _call_hf_processor(
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 1bad8b0405467..a247ba55c51a0 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -615,7 +615,7 @@ class GLM4VForCausalLM(
         return None
 
     def _process_image_input(self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
-        pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
+        pixel_values = image_input["data"].to(dtype=self.config.dtype)
 
         return self.transformer.vision(pixel_values)
 
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index 5d26e1c38eed4..5671347c00a23 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -114,7 +114,7 @@ class FlashConfig(PretrainedConfig):
         attention_dropout=0.0,
         mla_scale_q_lora=False,
         mla_scale_kv_lora=False,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
         params_dtype="bfloat16",
         router_dtype="float32",
         router_bias=False,
@@ -130,7 +130,7 @@ class FlashConfig(PretrainedConfig):
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
             params_dtype=params_dtype,
             router_dtype=router_dtype,
             topk_method=topk_method,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index e874aaa0fc7ad..77d77e7b9f86c 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -987,7 +987,7 @@ class NemotronH_Nano_VL_V2(
             prefix=maybe_prefix(prefix, "language_model"),
         )
         self.vision_model = self.get_vit_model_from_radio_config(config).to(
-            self.language_model.config.torch_dtype
+            self.language_model.config.dtype
         )
 
         # Construct the vision projection.
@@ -1008,7 +1008,7 @@ class NemotronH_Nano_VL_V2(
             ReLUSquaredActivation(),
             nn.Linear(vision_projection_hidden_size, llm_hidden_size, bias=False),
         )
-        self.mlp1 = self.mlp1.to(self.language_model.config.torch_dtype)
+        self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
 
         self.config = config
         self.model_config = vllm_config.model_config
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index a29def57c4a08..ac038aa3a958e 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -338,7 +338,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
             group_size=None,
             norm_before_gate=True,
             device=current_platform.current_device(),
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
         )
 
         self.out_proj = RowParallelLinear(
@@ -847,7 +847,7 @@ class Qwen3NextDecoderLayer(nn.Module):
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.torch_dtype,
+                    dtype=config.dtype,
                 ),
             )
             self.ffn_layer_scale = torch.nn.Parameter(
@@ -855,7 +855,7 @@ class Qwen3NextDecoderLayer(nn.Module):
                     1,
                     1,
                     config.hidden_size,
-                    dtype=config.torch_dtype,
+                    dtype=config.dtype,
                 ),
             )
 
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 82f5410ece63f..a8709ea4268f9 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -530,7 +530,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
         with init_on_device_without_buffers("meta"):
             self.model: PreTrainedModel = AutoModel.from_config(
                 self.config,
-                torch_dtype=self.model_config.dtype,
+                dtype=self.model_config.dtype,
                 trust_remote_code=self.model_config.trust_remote_code,
             )
 
diff --git a/vllm/model_executor/models/transformers_pooling.py b/vllm/model_executor/models/transformers_pooling.py
index 7ddeb403da448..7063a72748d77 100644
--- a/vllm/model_executor/models/transformers_pooling.py
+++ b/vllm/model_executor/models/transformers_pooling.py
@@ -157,7 +157,7 @@ class TransformersForSequenceClassification(TransformersPoolingBase):
         with torch.device("meta"):
             seq_cls_model = AutoModelForSequenceClassification.from_config(
                 self.config,
-                torch_dtype=self.model_config.dtype,
+                dtype=self.model_config.dtype,
                 trust_remote_code=self.model_config.trust_remote_code,
             )
 
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 04c2bbb43805b..a6b9df7c14462 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -500,8 +500,8 @@ class CudaPlatformBase(Platform):
         return supported
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             if not cls.has_device_capability(80):
                 capability = cls.get_device_capability()
                 gpu_name = cls.get_device_name()
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f08e62a4aa9c2..f9f2cc4d34e2d 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -563,7 +563,7 @@ class Platform:
         return False
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
         """
         Check if the dtype is supported by the current platform.
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 8fa07b10d34aa..b25b968893099 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -484,8 +484,8 @@ class RocmPlatform(Platform):
         return True
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             if not cls.has_device_capability(80):
                 capability = cls.get_device_capability()
                 gpu_name = cls.get_device_name()
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 4638e9fa30216..5e109cccfe761 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -236,8 +236,8 @@ class XPUPlatform(Platform):
         return torch.xpu.device_count()
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             device_name = cls.get_device_name().lower()
             # client gpu a770
             if device_name.count("a770") > 0:
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index bb5d3a688094f..ad0918a6ed8d0 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -806,7 +806,7 @@ def create_kv_caches_with_random_flash(
 
     current_platform.seed_everything(seed)
 
-    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
     generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
     assert cache_layout in ("NHD", "HND")
     stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, 4)
@@ -819,7 +819,7 @@ def create_kv_caches_with_random_flash(
 
     for _ in range(num_layers):
         key_value_cache = torch.empty(
-            size=kv_cache_allocation_shape, dtype=torch_dtype, device=device
+            size=kv_cache_allocation_shape, dtype=dtype, device=device
         ).permute(*stride_order)
         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
             key_value_cache.uniform_(-scale, scale)
@@ -851,14 +851,14 @@ def create_kv_caches_with_random(
 
     current_platform.seed_everything(seed)
 
-    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
 
     scale = head_size**-0.5
-    x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
+    x = 16 // torch.tensor([], dtype=dtype).element_size()
     key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
     key_caches: list[torch.Tensor] = []
     for _ in range(num_layers):
-        key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device)
+        key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device=device)
         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
             key_cache.uniform_(-scale, scale)
         elif cache_dtype == "fp8":
@@ -870,9 +870,7 @@ def create_kv_caches_with_random(
     value_cache_shape = (num_blocks, num_heads, head_size, block_size)
     value_caches: list[torch.Tensor] = []
     for _ in range(num_layers):
-        value_cache = torch.empty(
-            size=value_cache_shape, dtype=torch_dtype, device=device
-        )
+        value_cache = torch.empty(size=value_cache_shape, dtype=dtype, device=device)
         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
             value_cache.uniform_(-scale, scale)
         elif cache_dtype == "fp8":

From 5d598680e3b7e4751545481d6fba77597d068214 Mon Sep 17 00:00:00 2001
From: Max Wittig <max.wittig@siemens.com>
Date: Wed, 15 Oct 2025 14:40:33 +0200
Subject: [PATCH 37/51] chore: remove unused marker (#26890)

Signed-off-by: Max Wittig <max.wittig@siemens.com>
---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95dda76063bc1..eb9bdb593baac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,7 +107,6 @@ markers = [
     "distributed: run this test only in distributed GPU tests",
     "skip_v1: do not run this test with v1",
     "optional: optional tests that are automatically skipped, include --optional to run them",
-    "extra_server_args: extra arguments to pass to the server fixture",
 ]
 
 [tool.ty.src]

From f57438338d819c8e3e7e70293281c575ebd77411 Mon Sep 17 00:00:00 2001
From: Boyuan Feng <boyuan@meta.com>
Date: Wed, 15 Oct 2025 05:51:45 -0700
Subject: [PATCH 38/51] [BugFix] Patch inductor memory plan logic (#26878)

Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/mkdocs/hooks/generate_argparse.py        |  9 ++-
 .../compile/piecewise/test_multiple_graphs.py |  6 +-
 vllm/env_override.py                          | 72 ++++++++++++++++++-
 vllm/utils/__init__.py                        | 27 +++++++
 4 files changed, 108 insertions(+), 6 deletions(-)

diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index ecd71ee1f3f66..a4da5b933e159 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -22,6 +22,11 @@ sys.modules["vllm._C"] = MagicMock()
 class PydanticMagicMock(MagicMock):
     """`MagicMock` that's able to generate pydantic-core schemas."""
 
+    def __init__(self, *args, **kwargs):
+        name = kwargs.pop("name", None)
+        super().__init__(*args, **kwargs)
+        self.__spec__ = importlib.machinery.ModuleSpec(name, None)
+
     def __get_pydantic_core_schema__(self, source_type, handler):
         return core_schema.any_schema()
 
@@ -42,7 +47,9 @@ def auto_mock(module, attr, max_mocks=50):
             raise e
         except ModuleNotFoundError as e:
             logger.info("Mocking %s for argparse doc generation", e.name)
-            sys.modules[e.name] = PydanticMagicMock()
+            sys.modules[e.name] = PydanticMagicMock(name=e.name)
+        except Exception as e:
+            logger.warning("Failed to import %s.%s: %s", module, attr, e)
 
     raise ImportError(
         f"Failed to import {module}.{attr} after mocking {max_mocks} imports"
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
index d1f741479acf4..246239b87d5fe 100644
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -20,6 +20,7 @@ from vllm.config import (
     set_current_vllm_config,
 )
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils import is_torch_equal_or_newer
 
 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
@@ -193,9 +194,8 @@ def run_model(
 
 @pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
 def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
-    if use_inductor_graph_partition:
-        # FIXME(luka/boyuan): this currently fails
-        pytest.skip("Inductor graph partition not supported with multi-graph")
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
 
     outputs = []
 
diff --git a/vllm/env_override.py b/vllm/env_override.py
index eb51dee1cf033..f4ac48584cb7e 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -3,9 +3,9 @@
 import os
 
 import torch
-from packaging import version
 
 from vllm.logger import init_logger
+from vllm.utils import is_torch_equal
 
 logger = init_logger(__name__)
 
@@ -23,6 +23,72 @@ os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 # see https://github.com/vllm-project/vllm/issues/10619
 torch._inductor.config.compile_threads = 1
 
+# ===================================================
+# torch 2.9 Inductor PythonWrapperCodegen monkeypatch
+# ===================================================
+# This change monkeypatches memory_plan_reuse in pytorch 2.9.0 to work around
+# a test failure for test_multi_graph_piecewise_compile_outputs_equal.
+# For more context, see https://github.com/pytorch/pytorch/pull/165514.
+
+
+def memory_plan_reuse_patched(self):
+    import torch._inductor.ir as ir
+    from torch._inductor.codegen.wrapper import (
+        EnterSubgraphLine,
+        ExitSubgraphLine,
+        MemoryPlanningLine,
+        MemoryPlanningState,
+        SubgraphPythonWrapperCodegen,
+    )
+    from torch._inductor.virtualized import V
+
+    def get_output_names(graph_outputs) -> list[str]:
+        import itertools
+
+        names = []
+        shape_counter = itertools.count(0)
+        none_counter = itertools.count(0)
+        for node in graph_outputs:
+            if isinstance(node, ir.NoneAsConstantBuffer):
+                names.append(f"{V.graph.name}_none{next(none_counter)}")
+            elif isinstance(node, ir.ShapeAsConstantBuffer):
+                names.append(f"{V.graph.name}_shape{next(shape_counter)}")
+            else:
+                names.append(node.get_name())
+        return names
+
+    if (
+        isinstance(V.graph.wrapper_code, SubgraphPythonWrapperCodegen)
+        and V.graph.wrapper_code.partition_signatures is not None
+    ):
+        out_names = get_output_names(
+            V.graph.wrapper_code.partition_signatures.output_nodes
+        )
+    else:
+        out_names = V.graph.get_output_names()
+
+    while (
+        self.lines
+        and isinstance(self.lines[-1], MemoryPlanningLine)
+        and self.lines[-1].node.name not in out_names  # type: ignore[attr-defined]
+    ):
+        # these lines will be pointless
+        self.lines.pop()
+
+    # codegen allocations in two passes
+    planning_states = [MemoryPlanningState()]
+    past_planning_states = []
+    for i in range(len(self.lines)):
+        line = self.lines[i]
+        if isinstance(line, MemoryPlanningLine):
+            self.lines[i] = line.plan(planning_states[-1])
+        elif isinstance(line, EnterSubgraphLine):
+            planning_states.append(MemoryPlanningState())
+        elif isinstance(line, ExitSubgraphLine):
+            past_planning_states.append(planning_states.pop())
+    past_planning_states.append(planning_states.pop())
+    assert len(planning_states) == 0
+
 
 # ========================================
 # torch 2.9 Inductor Scheduler monkeypatch
@@ -135,7 +201,9 @@ def _update_scheduler_patched(self) -> None:
         self.scheduler = Scheduler(self.operations)
 
 
-if version.parse(str(torch.__version__)) == version.parse("2.9.0"):
+if is_torch_equal("2.9.0"):
+    from torch._inductor.codegen.wrapper import PythonWrapperCodegen
     from torch._inductor.graph import GraphLowering
 
+    PythonWrapperCodegen.memory_plan_reuse = memory_plan_reuse_patched
     GraphLowering._update_scheduler = _update_scheduler_patched
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index ad0918a6ed8d0..1f01cbeda9686 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3263,6 +3263,33 @@ def _is_torch_equal_or_newer(torch_version: str, target: str) -> bool:
     return torch_version >= version.parse(target)
 
 
+def _is_torch_equal(target: str) -> bool:
+    assert target.count(".") == 2
+    torch_version = str(torch.__version__)
+    torch_version = version.parse(torch_version)
+    # torch version is like "2.6.0.dev20240101" or "2.6.0.dev20240101+cpu"
+    # or "2.6.0+cu128" but never "2.6.0.1"
+    return (
+        torch_version >= version.parse(target)
+        and version.parse(target + ".1") > torch_version
+    )
+
+
+def is_torch_equal(target: str) -> bool:
+    """Check if the installed torch version is == the target version.
+
+    Args:
+        target: a version string, like "2.6.0".
+
+    Returns:
+        Whether the condition meets.
+    """
+    try:
+        return _is_torch_equal(target)
+    except Exception:
+        return Version(importlib.metadata.version("torch")) == Version(target)
+
+
 @cache
 def _has_module(module_name: str) -> bool:
     """Return True if *module_name* can be found in the current environment.

From 136a17fe6edacba292d899af4cfd424e0ca98d9f Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 15 Oct 2025 21:03:58 +0800
Subject: [PATCH 39/51] [Chore] Separate out `vllm.utils.func` (#26904)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../multimodal/generation/test_common.py      |   2 +-
 tests/utils_/test_func_utils.py               |  97 +++++++
 tests/utils_/test_jsontree.py                 |  32 +++
 tests/utils_/test_utils.py                    | 122 +--------
 vllm/entrypoints/chat_utils.py                |   3 +-
 vllm/entrypoints/openai/serving_engine.py     |   2 +-
 vllm/entrypoints/openai/serving_score.py      |   3 +-
 vllm/executor/executor_base.py                |   2 +-
 vllm/executor/ray_distributed_executor.py     |   2 +-
 .../layers/fused_moe/deep_gemm_moe.py         |   3 +-
 vllm/model_executor/models/interfaces.py      |   2 +-
 vllm/model_executor/models/interfaces_base.py |   2 +-
 vllm/multimodal/processing.py                 |   3 +-
 vllm/tracing.py                               |   2 +-
 vllm/transformers_utils/processor.py          |   2 +-
 vllm/utils/__init__.py                        | 238 +---------------
 vllm/utils/func.py                            | 258 ++++++++++++++++++
 vllm/v1/engine/async_llm.py                   |   3 +-
 18 files changed, 407 insertions(+), 371 deletions(-)
 create mode 100644 tests/utils_/test_func_utils.py
 create mode 100644 tests/utils_/test_jsontree.py
 create mode 100644 vllm/utils/func.py

diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index f124220bb16d9..af7dad079a9b3 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -17,7 +17,7 @@ from transformers import (
 )
 
 from vllm.platforms import current_platform
-from vllm.utils import identity
+from vllm.utils.func import identity
 
 from ....conftest import (
     IMAGE_ASSETS,
diff --git a/tests/utils_/test_func_utils.py b/tests/utils_/test_func_utils.py
new file mode 100644
index 0000000000000..147a396994596
--- /dev/null
+++ b/tests/utils_/test_func_utils.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+
+import pytest
+
+from vllm.utils.func import deprecate_kwargs, supports_kw
+
+from ..utils import error_on_warning
+
+
+def test_deprecate_kwargs_always():
+    @deprecate_kwargs("old_arg", is_deprecated=True)
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="'old_arg'"):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+
+def test_deprecate_kwargs_never():
+    @deprecate_kwargs("old_arg", is_deprecated=False)
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with error_on_warning(DeprecationWarning):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+
+def test_deprecate_kwargs_dynamic():
+    is_deprecated = True
+
+    @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="'old_arg'"):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+    is_deprecated = False
+
+    with error_on_warning(DeprecationWarning):
+        dummy(old_arg=1)
+
+    with error_on_warning(DeprecationWarning):
+        dummy(new_arg=1)
+
+
+def test_deprecate_kwargs_additional_message():
+    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
+    def dummy(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="abcd"):
+        dummy(old_arg=1)
+
+
+@pytest.mark.parametrize(
+    ("callable", "kw_name", "requires_kw_only", "allow_var_kwargs", "is_supported"),
+    [
+        # Tests for positional argument support
+        (lambda foo: None, "foo", True, True, False),
+        (lambda foo: None, "foo", False, True, True),
+        # Tests for positional or keyword / keyword only
+        (lambda foo=100: None, "foo", True, True, False),
+        (lambda *, foo: None, "foo", False, True, True),
+        # Tests to make sure the names of variadic params are NOT supported
+        (lambda *args: None, "args", False, True, False),
+        (lambda **kwargs: None, "kwargs", False, True, False),
+        # Tests for if we allow var kwargs to add support
+        (lambda foo: None, "something_else", False, True, False),
+        (lambda foo, **kwargs: None, "something_else", False, True, True),
+        (lambda foo, **kwargs: None, "kwargs", True, True, False),
+        (lambda foo, **kwargs: None, "foo", True, True, False),
+    ],
+)
+def test_supports_kw(
+    callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
+):
+    assert (
+        supports_kw(
+            callable=callable,
+            kw_name=kw_name,
+            requires_kw_only=requires_kw_only,
+            allow_var_kwargs=allow_var_kwargs,
+        )
+        == is_supported
+    )
diff --git a/tests/utils_/test_jsontree.py b/tests/utils_/test_jsontree.py
new file mode 100644
index 0000000000000..0af2751b2638c
--- /dev/null
+++ b/tests/utils_/test_jsontree.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.utils.jsontree import json_count_leaves
+
+
+def test_json_count_leaves():
+    """Test json_count_leaves function from jsontree utility."""
+
+    # Single leaf values
+    assert json_count_leaves(42) == 1
+    assert json_count_leaves("hello") == 1
+    assert json_count_leaves(None) == 1
+
+    # Empty containers
+    assert json_count_leaves([]) == 0
+    assert json_count_leaves({}) == 0
+    assert json_count_leaves(()) == 0
+
+    # Flat structures
+    assert json_count_leaves([1, 2, 3]) == 3
+    assert json_count_leaves({"a": 1, "b": 2}) == 2
+    assert json_count_leaves((1, 2, 3)) == 3
+
+    # Nested structures
+    nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
+    assert json_count_leaves(nested_dict) == 3
+
+    nested_list = [1, [2, 3], 4]
+    assert json_count_leaves(nested_list) == 4
+
+    mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
+    assert json_count_leaves(mixed_nested) == 4
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index af5fc758f2c26..b4883a4fea31a 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -30,7 +30,6 @@ from vllm.utils import (
     bind_kv_cache,
     common_broadcastable_dtype,
     current_stream,
-    deprecate_kwargs,
     get_open_port,
     get_tcp_uri,
     is_lossless_cast,
@@ -42,12 +41,11 @@ from vllm.utils import (
     sha256,
     split_host_port,
     split_zmq_path,
-    supports_kw,
     swap_dict_values,
     unique_filepath,
 )
 
-from ..utils import create_new_process_for_each_test, error_on_warning
+from ..utils import create_new_process_for_each_test
 
 
 @pytest.mark.asyncio
@@ -83,61 +81,6 @@ async def test_merge_async_iterators():
             raise AssertionError() from e
 
 
-def test_deprecate_kwargs_always():
-    @deprecate_kwargs("old_arg", is_deprecated=True)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with pytest.warns(DeprecationWarning, match="'old_arg'"):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-
-def test_deprecate_kwargs_never():
-    @deprecate_kwargs("old_arg", is_deprecated=False)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with error_on_warning(DeprecationWarning):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-
-def test_deprecate_kwargs_dynamic():
-    is_deprecated = True
-
-    @deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with pytest.warns(DeprecationWarning, match="'old_arg'"):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-    is_deprecated = False
-
-    with error_on_warning(DeprecationWarning):
-        dummy(old_arg=1)
-
-    with error_on_warning(DeprecationWarning):
-        dummy(new_arg=1)
-
-
-def test_deprecate_kwargs_additional_message():
-    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
-    def dummy(*, old_arg: object = None, new_arg: object = None):
-        pass
-
-    with pytest.warns(DeprecationWarning, match="abcd"):
-        dummy(old_arg=1)
-
-
 def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_PORT", "5678")
@@ -383,39 +326,6 @@ def test_duplicate_dict_args(caplog_vllm, parser):
     assert "-O.mode" in caplog_vllm.text
 
 
-@pytest.mark.parametrize(
-    "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
-    [
-        # Tests for positional argument support
-        (lambda foo: None, "foo", True, True, False),
-        (lambda foo: None, "foo", False, True, True),
-        # Tests for positional or keyword / keyword only
-        (lambda foo=100: None, "foo", True, True, False),
-        (lambda *, foo: None, "foo", False, True, True),
-        # Tests to make sure the names of variadic params are NOT supported
-        (lambda *args: None, "args", False, True, False),
-        (lambda **kwargs: None, "kwargs", False, True, False),
-        # Tests for if we allow var kwargs to add support
-        (lambda foo: None, "something_else", False, True, False),
-        (lambda foo, **kwargs: None, "something_else", False, True, True),
-        (lambda foo, **kwargs: None, "kwargs", True, True, False),
-        (lambda foo, **kwargs: None, "foo", True, True, False),
-    ],
-)
-def test_supports_kw(
-    callable, kw_name, requires_kw_only, allow_var_kwargs, is_supported
-):
-    assert (
-        supports_kw(
-            callable=callable,
-            kw_name=kw_name,
-            requires_kw_only=requires_kw_only,
-            allow_var_kwargs=allow_var_kwargs,
-        )
-        == is_supported
-    )
-
-
 @create_new_process_for_each_test()
 def test_memory_profiling():
     # Fake out some model loading + inference memory usage to test profiling
@@ -863,36 +773,6 @@ def test_join_host_port():
     assert join_host_port("::1", 5555) == "[::1]:5555"
 
 
-def test_json_count_leaves():
-    """Test json_count_leaves function from jsontree utility."""
-    from vllm.utils.jsontree import json_count_leaves
-
-    # Single leaf values
-    assert json_count_leaves(42) == 1
-    assert json_count_leaves("hello") == 1
-    assert json_count_leaves(None) == 1
-
-    # Empty containers
-    assert json_count_leaves([]) == 0
-    assert json_count_leaves({}) == 0
-    assert json_count_leaves(()) == 0
-
-    # Flat structures
-    assert json_count_leaves([1, 2, 3]) == 3
-    assert json_count_leaves({"a": 1, "b": 2}) == 2
-    assert json_count_leaves((1, 2, 3)) == 3
-
-    # Nested structures
-    nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
-    assert json_count_leaves(nested_dict) == 3
-
-    nested_list = [1, [2, 3], 4]
-    assert json_count_leaves(nested_list) == 4
-
-    mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
-    assert json_count_leaves(mixed_nested) == 4
-
-
 def test_convert_ids_list_to_tokens():
     tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
     token_ids = tokenizer.encode("Hello, world!")
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 21973018a2b64..0d8b0280d5045 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -50,7 +50,8 @@ from vllm.multimodal.utils import MediaConnector
 from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path
 from vllm.transformers_utils.processor import cached_get_processor
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import random_uuid, supports_kw
+from vllm.utils import random_uuid
+from vllm.utils.func import supports_kw
 
 logger = init_logger(__name__)
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 3965d2dac0887..c318c0f425bd2 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -94,10 +94,10 @@ from vllm.utils import (
     AsyncMicrobatchTokenizer,
     collect_from_async_generator,
     is_list_of,
-    make_async,
     merge_async_iterators,
     random_uuid,
 )
+from vllm.utils.func import make_async
 from vllm.v1.engine import EngineCoreRequest
 
 logger = init_logger(__name__)
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 7506e17fe585b..e5c7f80a17533 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -37,7 +37,8 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import make_async, merge_async_iterators
+from vllm.utils import merge_async_iterators
+from vllm.utils.func import make_async
 
 logger = init_logger(__name__)
 
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index a5f83f9040023..093d5e97fd3e4 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.tasks import SupportedTask
-from vllm.utils import make_async
+from vllm.utils.func import make_async
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.worker.worker_base import WorkerBase
 
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 59e282ac92b6d..a57b64152f49c 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -24,8 +24,8 @@ from vllm.utils import (
     get_distributed_init_method,
     get_ip,
     get_open_port,
-    make_async,
 )
+from vllm.utils.func import make_async
 from vllm.v1.outputs import SamplerOutput
 
 if ray is not None:
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index 350c21e0a95bc..169b14ba46eb9 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -27,8 +27,9 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
 )
-from vllm.utils import has_deep_gemm, run_once
+from vllm.utils import has_deep_gemm
 from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous
+from vllm.utils.func import run_once
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index d25a0c18d1659..2487d7a691135 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -24,7 +24,7 @@ from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.utils import supports_kw
+from vllm.utils.func import supports_kw
 
 from .interfaces_base import VllmModel, is_pooling_model
 
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index afb94f7c35467..da1ffd2548274 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -15,7 +15,7 @@ import torch.nn as nn
 from typing_extensions import TypeIs, TypeVar
 
 from vllm.logger import init_logger
-from vllm.utils import supports_kw
+from vllm.utils.func import supports_kw
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 5d9876539499d..96055551c26ef 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -25,7 +25,8 @@ from typing_extensions import TypeVar, assert_never
 from vllm.logger import init_logger
 from vllm.transformers_utils.processor import cached_processor_from_config
 from vllm.transformers_utils.tokenizer import AnyTokenizer, decode_tokens, encode_tokens
-from vllm.utils import flatten_2d_lists, full_groupby, get_allowed_kwarg_only_overrides
+from vllm.utils import flatten_2d_lists, full_groupby
+from vllm.utils.func import get_allowed_kwarg_only_overrides
 from vllm.utils.jsontree import JSONTree, json_map_leaves
 
 from .hasher import MultiModalHasher
diff --git a/vllm/tracing.py b/vllm/tracing.py
index 7e3e883ca5f2d..b4008064fef0e 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -5,7 +5,7 @@ import os
 from collections.abc import Mapping
 
 from vllm.logger import init_logger
-from vllm.utils import run_once
+from vllm.utils.func import run_once
 
 TRACE_HEADERS = ["traceparent", "tracestate"]
 
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 0a55ac96ccf89..cdc138064a33c 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -16,7 +16,7 @@ from transformers.processing_utils import ProcessorMixin
 from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar
 
-from vllm.utils import get_allowed_kwarg_only_overrides
+from vllm.utils.func import get_allowed_kwarg_only_overrides
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 1f01cbeda9686..5fd94b7b40492 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
-import concurrent
 import contextlib
 import datetime
 import enum
@@ -43,7 +42,6 @@ from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import UserDict, defaultdict
 from collections.abc import (
     AsyncGenerator,
-    Awaitable,
     Callable,
     Collection,
     Generator,
@@ -85,7 +83,7 @@ from packaging import version
 from packaging.version import Version
 from torch.library import Library
 from transformers.tokenization_utils_base import BatchEncoding
-from typing_extensions import Never, ParamSpec, TypeIs, assert_never
+from typing_extensions import Never, TypeIs, assert_never
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -174,7 +172,6 @@ def set_default_torch_num_threads(num_threads: int):
     torch.set_num_threads(old_num_threads)
 
 
-P = ParamSpec("P")
 T = TypeVar("T")
 U = TypeVar("U")
 
@@ -452,24 +449,6 @@ def in_loop(event_loop: AbstractEventLoop) -> bool:
         return False
 
 
-def make_async(
-    func: Callable[P, T], executor: concurrent.futures.Executor | None = None
-) -> Callable[P, Awaitable[T]]:
-    """Take a blocking function, and run it on in an executor thread.
-
-    This function prevents the blocking function from blocking the
-    asyncio event loop.
-    The code in this function needs to be thread safe.
-    """
-
-    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future:
-        loop = asyncio.get_event_loop()
-        p_func = partial(func, *args, **kwargs)
-        return loop.run_in_executor(executor=executor, func=p_func)
-
-    return _async_wrapper
-
-
 async def merge_async_iterators(
     *iterators: AsyncGenerator[T, None],
 ) -> AsyncGenerator[tuple[int, T], None]:
@@ -1254,90 +1233,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
         enable_trace_function_call(log_path)
 
 
-# `functools` helpers
-def identity(value: T, **kwargs) -> T:
-    """Returns the first provided value."""
-    return value
-
-
-F = TypeVar("F", bound=Callable[..., Any])
-
-
-def deprecate_args(
-    start_index: int,
-    is_deprecated: bool | Callable[[], bool] = True,
-    additional_message: str | None = None,
-) -> Callable[[F], F]:
-    if not callable(is_deprecated):
-        is_deprecated = partial(identity, is_deprecated)
-
-    def wrapper(fn: F) -> F:
-        params = inspect.signature(fn).parameters
-        pos_types = (
-            inspect.Parameter.POSITIONAL_ONLY,
-            inspect.Parameter.POSITIONAL_OR_KEYWORD,
-        )
-        pos_kws = [kw for kw, param in params.items() if param.kind in pos_types]
-
-        @wraps(fn)
-        def inner(*args, **kwargs):
-            if is_deprecated():
-                deprecated_args = pos_kws[start_index : len(args)]
-                if deprecated_args:
-                    msg = (
-                        f"The positional arguments {deprecated_args} are "
-                        "deprecated and will be removed in a future update."
-                    )
-                    if additional_message is not None:
-                        msg += f" {additional_message}"
-
-                    warnings.warn(
-                        DeprecationWarning(msg),
-                        stacklevel=3,  # The inner function takes up one level
-                    )
-
-            return fn(*args, **kwargs)
-
-        return inner  # type: ignore
-
-    return wrapper
-
-
-def deprecate_kwargs(
-    *kws: str,
-    is_deprecated: bool | Callable[[], bool] = True,
-    additional_message: str | None = None,
-) -> Callable[[F], F]:
-    deprecated_kws = set(kws)
-
-    if not callable(is_deprecated):
-        is_deprecated = partial(identity, is_deprecated)
-
-    def wrapper(fn: F) -> F:
-        @wraps(fn)
-        def inner(*args, **kwargs):
-            if is_deprecated():
-                deprecated_kwargs = kwargs.keys() & deprecated_kws
-                if deprecated_kwargs:
-                    msg = (
-                        f"The keyword arguments {deprecated_kwargs} are "
-                        "deprecated and will be removed in a future update."
-                    )
-                    if additional_message is not None:
-                        msg += f" {additional_message}"
-
-                    warnings.warn(
-                        DeprecationWarning(msg),
-                        stacklevel=3,  # The inner function takes up one level
-                    )
-
-            return fn(*args, **kwargs)
-
-        return inner  # type: ignore
-
-    return wrapper
-
-
 @lru_cache(maxsize=8)
 def _cuda_device_count_stateless(cuda_visible_devices: str | None = None) -> int:
     # Note: cuda_visible_devices is not used, but we keep it as an argument for
@@ -1426,21 +1321,6 @@ def weak_bind(
     return weak_bound
 
 
-def run_once(f: Callable[P, None]) -> Callable[P, None]:
-    def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
-        if wrapper.has_run:  # type: ignore[attr-defined]
-            return
-
-        with wrapper.lock:  # type: ignore[attr-defined]
-            if not wrapper.has_run:  # type: ignore[attr-defined]
-                wrapper.has_run = True  # type: ignore[attr-defined]
-                return f(*args, **kwargs)
-
-    wrapper.has_run = False  # type: ignore[attr-defined]
-    wrapper.lock = threading.Lock()  # type: ignore[attr-defined]
-    return wrapper
-
-
 class StoreBoolean(Action):
     def __call__(self, parser, namespace, values, option_string=None):
         if values.lower() == "true":
@@ -1929,122 +1809,6 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, **kwarg
         return await task(*args, **kwargs)
 
 
-@lru_cache
-def supports_kw(
-    callable: Callable[..., object],
-    kw_name: str,
-    *,
-    requires_kw_only: bool = False,
-    allow_var_kwargs: bool = True,
-) -> bool:
-    """Check if a keyword is a valid kwarg for a callable; if requires_kw_only
-    disallows kwargs names that can also be positional arguments.
-    """
-    params = inspect.signature(callable).parameters
-    if not params:
-        return False
-
-    param_val = params.get(kw_name)
-
-    # Types where the it may be valid, i.e., explicitly defined & nonvariadic
-    passable_kw_types = set(
-        (
-            inspect.Parameter.POSITIONAL_ONLY,
-            inspect.Parameter.POSITIONAL_OR_KEYWORD,
-            inspect.Parameter.KEYWORD_ONLY,
-        )
-    )
-
-    if param_val:
-        is_sig_param = param_val.kind in passable_kw_types
-        # We want kwargs only, but this is passable as a positional arg
-        if (
-            requires_kw_only
-            and is_sig_param
-            and param_val.kind != inspect.Parameter.KEYWORD_ONLY
-        ):
-            return False
-        if (requires_kw_only and param_val.kind == inspect.Parameter.KEYWORD_ONLY) or (
-            not requires_kw_only and is_sig_param
-        ):
-            return True
-
-    # If we're okay with var-kwargs, it's supported as long as
-    # the kw_name isn't something like *args, **kwargs
-    if allow_var_kwargs:
-        # Get the last param; type is ignored here because params is a proxy
-        # mapping, but it wraps an ordered dict, and they appear in order.
-        # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters
-        last_param = params[next(reversed(params))]  # type: ignore
-        return (
-            last_param.kind == inspect.Parameter.VAR_KEYWORD
-            and last_param.name != kw_name
-        )
-
-    return False
-
-
-def get_allowed_kwarg_only_overrides(
-    callable: Callable[..., object],
-    overrides: Mapping[str, object] | None,
-    *,
-    requires_kw_only: bool = True,
-    allow_var_kwargs: bool = False,
-) -> dict[str, Any]:
-    """
-    Given a callable which has one or more keyword only params and a dict
-    mapping param names to values, drop values that can be not be kwarg
-    expanded to overwrite one or more keyword-only args. This is used in a
-    few places to handle custom processor overrides for multimodal models,
-    e.g., for profiling when processor options provided by the user
-    may affect the number of mm tokens per instance.
-
-    Args:
-        callable: Callable which takes 0 or more keyword only arguments.
-                  If None is provided, all overrides names are allowed.
-        overrides: Potential overrides to be used when invoking the callable.
-        allow_var_kwargs: Allows overrides that are expandable for var kwargs.
-
-    Returns:
-        Dictionary containing the kwargs to be leveraged which may be used
-        to overwrite one or more keyword only arguments when invoking the
-        callable.
-    """
-    if not overrides:
-        return {}
-
-    # Drop any mm_processor_kwargs provided by the user that
-    # are not kwargs, unless it can fit it var_kwargs param
-    filtered_overrides = {
-        kwarg_name: val
-        for kwarg_name, val in overrides.items()
-        if supports_kw(
-            callable,
-            kwarg_name,
-            requires_kw_only=requires_kw_only,
-            allow_var_kwargs=allow_var_kwargs,
-        )
-    }
-
-    # If anything is dropped, log a warning
-    dropped_keys = overrides.keys() - filtered_overrides.keys()
-    if dropped_keys:
-        if requires_kw_only:
-            logger.warning(
-                "The following intended overrides are not keyword-only args "
-                "and will be dropped: %s",
-                dropped_keys,
-            )
-        else:
-            logger.warning(
-                "The following intended overrides are not keyword args "
-                "and will be dropped: %s",
-                dropped_keys,
-            )
-
-    return filtered_overrides
-
-
 # Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
 # In particular, the FakeScalarType is not supported for earlier versions of
 # PyTorch which breaks dynamo for any ops registered using ScalarType.
diff --git a/vllm/utils/func.py b/vllm/utils/func.py
new file mode 100644
index 0000000000000..bd26b29d5f6dc
--- /dev/null
+++ b/vllm/utils/func.py
@@ -0,0 +1,258 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Contains helpers that are applied to functions.
+
+This is similar in concept to the `functools` module.
+"""
+
+import asyncio
+import concurrent.futures
+import inspect
+import threading
+import warnings
+from collections.abc import Awaitable, Callable, Mapping
+from functools import lru_cache, partial, wraps
+from typing import Any, TypeVar
+
+from typing_extensions import ParamSpec
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+P = ParamSpec("P")
+T = TypeVar("T")
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+def identity(value: T, **kwargs) -> T:
+    """Returns the first provided value."""
+    return value
+
+
+def make_async(
+    func: Callable[P, T],
+    executor: concurrent.futures.Executor | None = None,
+) -> Callable[P, Awaitable[T]]:
+    """
+    Take a blocking function, and run it on in an executor thread.
+
+    This function prevents the blocking function from blocking the
+    asyncio event loop.
+    The code in this function needs to be thread safe.
+    """
+
+    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future[T]:
+        loop = asyncio.get_event_loop()
+        p_func = partial(func, *args, **kwargs)
+        return loop.run_in_executor(executor=executor, func=p_func)
+
+    return _async_wrapper
+
+
+def run_once(f: Callable[P, None]) -> Callable[P, None]:
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
+        if wrapper.has_run:  # type: ignore[attr-defined]
+            return
+
+        with wrapper.lock:  # type: ignore[attr-defined]
+            if not wrapper.has_run:  # type: ignore[attr-defined]
+                wrapper.has_run = True  # type: ignore[attr-defined]
+                return f(*args, **kwargs)
+
+    wrapper.has_run = False  # type: ignore[attr-defined]
+    wrapper.lock = threading.Lock()  # type: ignore[attr-defined]
+    return wrapper
+
+
+def deprecate_args(
+    start_index: int,
+    is_deprecated: bool | Callable[[], bool] = True,
+    additional_message: str | None = None,
+) -> Callable[[F], F]:
+    if not callable(is_deprecated):
+        is_deprecated = partial(identity, is_deprecated)
+
+    def wrapper(fn: F) -> F:
+        params = inspect.signature(fn).parameters
+        pos_types = (
+            inspect.Parameter.POSITIONAL_ONLY,
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        )
+        pos_kws = [kw for kw, param in params.items() if param.kind in pos_types]
+
+        @wraps(fn)
+        def inner(*args, **kwargs):
+            if is_deprecated():
+                deprecated_args = pos_kws[start_index : len(args)]
+                if deprecated_args:
+                    msg = (
+                        f"The positional arguments {deprecated_args} are "
+                        "deprecated and will be removed in a future update."
+                    )
+                    if additional_message is not None:
+                        msg += f" {additional_message}"
+
+                    warnings.warn(
+                        DeprecationWarning(msg),
+                        stacklevel=3,  # The inner function takes up one level
+                    )
+
+            return fn(*args, **kwargs)
+
+        return inner  # type: ignore
+
+    return wrapper
+
+
+def deprecate_kwargs(
+    *kws: str,
+    is_deprecated: bool | Callable[[], bool] = True,
+    additional_message: str | None = None,
+) -> Callable[[F], F]:
+    deprecated_kws = set(kws)
+
+    if not callable(is_deprecated):
+        is_deprecated = partial(identity, is_deprecated)
+
+    def wrapper(fn: F) -> F:
+        @wraps(fn)
+        def inner(*args, **kwargs):
+            if is_deprecated():
+                deprecated_kwargs = kwargs.keys() & deprecated_kws
+                if deprecated_kwargs:
+                    msg = (
+                        f"The keyword arguments {deprecated_kwargs} are "
+                        "deprecated and will be removed in a future update."
+                    )
+                    if additional_message is not None:
+                        msg += f" {additional_message}"
+
+                    warnings.warn(
+                        DeprecationWarning(msg),
+                        stacklevel=3,  # The inner function takes up one level
+                    )
+
+            return fn(*args, **kwargs)
+
+        return inner  # type: ignore
+
+    return wrapper
+
+
+@lru_cache
+def supports_kw(
+    callable: Callable[..., object],
+    kw_name: str,
+    *,
+    requires_kw_only: bool = False,
+    allow_var_kwargs: bool = True,
+) -> bool:
+    """Check if a keyword is a valid kwarg for a callable; if requires_kw_only
+    is set, disallow kwarg names that can also be passed positionally.
+    """
+    params = inspect.signature(callable).parameters
+    if not params:
+        return False
+
+    param_val = params.get(kw_name)
+
+    # Types where it may be valid, i.e., explicitly defined & nonvariadic
+    passable_kw_types = set(
+        (
+            inspect.Parameter.POSITIONAL_ONLY,
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+            inspect.Parameter.KEYWORD_ONLY,
+        )
+    )
+
+    if param_val:
+        is_sig_param = param_val.kind in passable_kw_types
+        # We want kwargs only, but this is passable as a positional arg
+        if (
+            requires_kw_only
+            and is_sig_param
+            and param_val.kind != inspect.Parameter.KEYWORD_ONLY
+        ):
+            return False
+        if (requires_kw_only and param_val.kind == inspect.Parameter.KEYWORD_ONLY) or (
+            not requires_kw_only and is_sig_param
+        ):
+            return True
+
+    # If we're okay with var-kwargs, it's supported as long as
+    # the kw_name isn't something like *args, **kwargs
+    if allow_var_kwargs:
+        # Get the last param; type is ignored here because params is a proxy
+        # mapping, but it wraps an ordered dict, and they appear in order.
+        # Ref: https://docs.python.org/3/library/inspect.html#inspect.Signature.parameters
+        last_param = params[next(reversed(params))]  # type: ignore
+        return (
+            last_param.kind == inspect.Parameter.VAR_KEYWORD
+            and last_param.name != kw_name
+        )
+
+    return False
+
+
+def get_allowed_kwarg_only_overrides(
+    callable: Callable[..., object],
+    overrides: Mapping[str, object] | None,
+    *,
+    requires_kw_only: bool = True,
+    allow_var_kwargs: bool = False,
+) -> dict[str, Any]:
+    """
+    Given a callable which has one or more keyword only params and a dict
+    mapping param names to values, drop values that cannot be kwarg
+    expanded to overwrite one or more keyword-only args. This is used in a
+    few places to handle custom processor overrides for multimodal models,
+    e.g., for profiling when processor options provided by the user
+    may affect the number of mm tokens per instance.
+
+    Args:
+        callable: Callable which takes 0 or more keyword only arguments.
+                  Each override name is checked against its signature.
+        overrides: Potential overrides to be used when invoking the callable.
+        allow_var_kwargs: Allows overrides that are expandable for var kwargs.
+
+    Returns:
+        Dictionary containing the kwargs to be leveraged which may be used
+        to overwrite one or more keyword only arguments when invoking the
+        callable.
+    """
+    if not overrides:
+        return {}
+
+    # Drop any mm_processor_kwargs provided by the user that
+    # are not kwargs, unless they can fit into a var_kwargs param
+    filtered_overrides = {
+        kwarg_name: val
+        for kwarg_name, val in overrides.items()
+        if supports_kw(
+            callable,
+            kwarg_name,
+            requires_kw_only=requires_kw_only,
+            allow_var_kwargs=allow_var_kwargs,
+        )
+    }
+
+    # If anything is dropped, log a warning
+    dropped_keys = overrides.keys() - filtered_overrides.keys()
+    if dropped_keys:
+        if requires_kw_only:
+            logger.warning(
+                "The following intended overrides are not keyword-only args "
+                "and will be dropped: %s",
+                dropped_keys,
+            )
+        else:
+            logger.warning(
+                "The following intended overrides are not keyword args "
+                "and will be dropped: %s",
+                dropped_keys,
+            )
+
+    return filtered_overrides
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 0ec153e233161..c8fb30f96c0a0 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -29,7 +29,8 @@ from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
 from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, as_list, cancel_task_threadsafe, cdiv, deprecate_kwargs
+from vllm.utils import Device, as_list, cancel_task_threadsafe, cdiv
+from vllm.utils.func import deprecate_kwargs
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError

From 828523ad8e26d15e5cec27ba623310a950f53121 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 15 Oct 2025 23:33:00 +0800
Subject: [PATCH 40/51] [Chore] Separate out `vllm.utils.async_utils` (#26913)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/lora/test_add_lora.py                   |   2 +-
 tests/utils_/test_async_utils.py              |  42 +++
 tests/utils_/test_utils.py                    |  36 ---
 vllm/benchmarks/throughput.py                 |   2 +-
 vllm/entrypoints/openai/serving_completion.py |   3 +-
 vllm/entrypoints/openai/serving_embedding.py  |   3 +-
 vllm/entrypoints/openai/serving_engine.py     |   7 +-
 vllm/entrypoints/openai/serving_pooling.py    |   2 +-
 vllm/entrypoints/openai/serving_score.py      |   3 +-
 vllm/entrypoints/renderer.py                  |   2 +-
 vllm/executor/executor_base.py                |   2 +-
 vllm/executor/ray_distributed_executor.py     |   9 +-
 vllm/utils/__init__.py                        | 277 ----------------
 vllm/utils/async_utils.py                     | 299 ++++++++++++++++++
 vllm/utils/func.py                            |  24 +-
 vllm/v1/engine/async_llm.py                   |   3 +-
 vllm/v1/engine/core_client.py                 |   2 +-
 17 files changed, 364 insertions(+), 354 deletions(-)
 create mode 100644 tests/utils_/test_async_utils.py
 create mode 100644 vllm/utils/async_utils.py

diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py
index 2f28253bce536..9a82ab99ea9c9 100644
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.api_server import (
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
-from vllm.utils import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 
 MODEL_PATH = "zai-org/chatglm3-6b"
 LORA_RANK = 64
diff --git a/tests/utils_/test_async_utils.py b/tests/utils_/test_async_utils.py
new file mode 100644
index 0000000000000..03d116bdfd814
--- /dev/null
+++ b/tests/utils_/test_async_utils.py
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+from collections.abc import AsyncIterator
+
+import pytest
+
+from vllm.utils.async_utils import merge_async_iterators
+
+
+async def _mock_async_iterator(idx: int):
+    try:
+        while True:
+            yield f"item from iterator {idx}"
+            await asyncio.sleep(0.1)
+    except asyncio.CancelledError:
+        print(f"iterator {idx} cancelled")
+
+
+@pytest.mark.asyncio
+async def test_merge_async_iterators():
+    iterators = [_mock_async_iterator(i) for i in range(3)]
+    merged_iterator = merge_async_iterators(*iterators)
+
+    async def stream_output(generator: AsyncIterator[tuple[int, str]]):
+        async for idx, output in generator:
+            print(f"idx: {idx}, output: {output}")
+
+    task = asyncio.create_task(stream_output(merged_iterator))
+    await asyncio.sleep(0.5)
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+
+    for iterator in iterators:
+        try:
+            await asyncio.wait_for(anext(iterator), 1)
+        except StopAsyncIteration:
+            # All iterators should be cancelled and print this message.
+            print("Iterator was cancelled normally")
+        except (Exception, asyncio.CancelledError) as e:
+            raise AssertionError() from e
diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index b4883a4fea31a..3bc4d3536d58e 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -2,14 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 
-import asyncio
 import hashlib
 import json
 import os
 import pickle
 import socket
 import tempfile
-from collections.abc import AsyncIterator
 from pathlib import Path
 from unittest.mock import patch
 
@@ -37,7 +35,6 @@ from vllm.utils import (
     make_zmq_path,
     make_zmq_socket,
     memory_profiling,
-    merge_async_iterators,
     sha256,
     split_host_port,
     split_zmq_path,
@@ -48,39 +45,6 @@ from vllm.utils import (
 from ..utils import create_new_process_for_each_test
 
 
-@pytest.mark.asyncio
-async def test_merge_async_iterators():
-    async def mock_async_iterator(idx: int):
-        try:
-            while True:
-                yield f"item from iterator {idx}"
-                await asyncio.sleep(0.1)
-        except asyncio.CancelledError:
-            print(f"iterator {idx} cancelled")
-
-    iterators = [mock_async_iterator(i) for i in range(3)]
-    merged_iterator = merge_async_iterators(*iterators)
-
-    async def stream_output(generator: AsyncIterator[tuple[int, str]]):
-        async for idx, output in generator:
-            print(f"idx: {idx}, output: {output}")
-
-    task = asyncio.create_task(stream_output(merged_iterator))
-    await asyncio.sleep(0.5)
-    task.cancel()
-    with pytest.raises(asyncio.CancelledError):
-        await task
-
-    for iterator in iterators:
-        try:
-            await asyncio.wait_for(anext(iterator), 1)
-        except StopAsyncIteration:
-            # All iterators should be cancelled and print this message.
-            print("Iterator was cancelled normally")
-        except (Exception, asyncio.CancelledError) as e:
-            raise AssertionError() from e
-
-
 def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_PORT", "5678")
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index ad111a1ebd5be..866365ac18eb9 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -34,7 +34,7 @@ from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.utils import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 
 
 def run_vllm(
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 7cbe9c69435c3..f33fce7716a98 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -34,7 +34,8 @@ from vllm.logprobs import Logprob
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import as_list, merge_async_iterators
+from vllm.utils import as_list
+from vllm.utils.async_utils import merge_async_iterators
 
 logger = init_logger(__name__)
 
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index e2b940ef00c09..4c05d9f57fa63 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -40,6 +40,7 @@ from vllm.outputs import (
 )
 from vllm.pooling_params import PoolingParams
 from vllm.utils import chunk_list
+from vllm.utils.async_utils import merge_async_iterators
 
 logger = init_logger(__name__)
 
@@ -387,8 +388,6 @@ class EmbeddingMixin(OpenAIServing):
                 )
                 generators.append(generator)
 
-            from vllm.utils import merge_async_iterators
-
             ctx.result_generator = merge_async_iterators(*generators)
 
             return None
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index c318c0f425bd2..6464d4f9e6751 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -90,14 +90,13 @@ from vllm.tracing import (
     log_tracing_disabled_warning,
 )
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import (
+from vllm.utils import is_list_of, random_uuid
+from vllm.utils.async_utils import (
     AsyncMicrobatchTokenizer,
     collect_from_async_generator,
-    is_list_of,
+    make_async,
     merge_async_iterators,
-    random_uuid,
 )
-from vllm.utils.func import make_async
 from vllm.v1.engine import EngineCoreRequest
 
 logger = init_logger(__name__)
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index aa81a233b2979..7a27348da35b8 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -36,7 +36,7 @@ from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.logger import init_logger
 from vllm.outputs import PoolingOutput, PoolingRequestOutput
 from vllm.tasks import SupportedTask
-from vllm.utils import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators
 
 logger = init_logger(__name__)
 
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index e5c7f80a17533..9cbfc9791819e 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -37,8 +37,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.utils import merge_async_iterators
-from vllm.utils.func import make_async
+from vllm.utils.async_utils import make_async, merge_async_iterators
 
 logger = init_logger(__name__)
 
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 4f1213b097306..63487a6ed0072 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -17,7 +17,7 @@ from vllm.inputs.data import TextPrompt as EngineTextPrompt
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import AsyncMicrobatchTokenizer
+from vllm.utils.async_utils import AsyncMicrobatchTokenizer
 
 
 @dataclass(frozen=True)
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 093d5e97fd3e4..9de2249f6c050 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -17,7 +17,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import ExecuteModelRequest
 from vllm.tasks import SupportedTask
-from vllm.utils.func import make_async
+from vllm.utils.async_utils import make_async
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.worker.worker_base import WorkerBase
 
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index a57b64152f49c..b41466a6a7705 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -20,12 +20,11 @@ from vllm.platforms import current_platform
 from vllm.ray.ray_env import get_env_vars_to_copy
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import (
-    _run_task_with_lock,
     get_distributed_init_method,
     get_ip,
     get_open_port,
 )
-from vllm.utils.func import make_async
+from vllm.utils.async_utils import make_async
 from vllm.v1.outputs import SamplerOutput
 
 if ray is not None:
@@ -748,3 +747,9 @@ class RayDistributedExecutor(DistributedExecutorBase):
         # Assume that the Ray workers are healthy.
         # TODO: check the health of the Ray workers
         return
+
+
+async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, **kwargs):
+    """Utility function to run an async task while holding a lock"""
+    async with lock:
+        return await task(*args, **kwargs)
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 5fd94b7b40492..99a9225cb6a42 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import asyncio
 import contextlib
 import datetime
 import enum
@@ -38,10 +37,8 @@ from argparse import (
     RawDescriptionHelpFormatter,
     _ArgumentGroup,
 )
-from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import UserDict, defaultdict
 from collections.abc import (
-    AsyncGenerator,
     Callable,
     Collection,
     Generator,
@@ -51,7 +48,6 @@ from collections.abc import (
     Mapping,
     Sequence,
 )
-from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures.process import ProcessPoolExecutor
 from dataclasses import dataclass, field
 from functools import cache, lru_cache, partial, wraps
@@ -82,7 +78,6 @@ import zmq.asyncio
 from packaging import version
 from packaging.version import Version
 from torch.library import Library
-from transformers.tokenization_utils_base import BatchEncoding
 from typing_extensions import Never, TypeIs, assert_never
 
 import vllm.envs as envs
@@ -223,278 +218,12 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-class AsyncMicrobatchTokenizer:
-    """Asynchronous tokenizer with micro-batching.
-
-    Pulls pending encode/decode requests from a queue and batches them
-    up to reduce overhead. A single-thread ThreadPoolExecutor is used
-    so the event loop stays responsive.
-    """
-
-    def __init__(
-        self,
-        tokenizer,
-        max_batch_size: int = 32,
-        batch_wait_timeout_s: float = 0.002,
-    ) -> None:
-        self.tokenizer = tokenizer
-        self.max_batch_size = max_batch_size
-        self.batch_wait_timeout_s = batch_wait_timeout_s
-
-        self._loop = asyncio.get_running_loop()
-        self._queues: dict[
-            tuple,
-            asyncio.Queue[
-                tuple[str, dict, asyncio.Future] | tuple[list[int], asyncio.Future]
-            ],
-        ] = {}
-        self._batcher_tasks: list[asyncio.Task] = []
-
-        # Single-thread executor for blocking tokenizer calls.
-        self._executor = ThreadPoolExecutor(max_workers=1)
-
-    # === Public async API ===
-    async def __call__(self, prompt, **kwargs):
-        result_future: asyncio.Future = self._loop.create_future()
-        key = self._queue_key("encode", kwargs)
-        queue = self._get_queue(self._loop, key)
-        await queue.put((prompt, kwargs, result_future))
-        return await result_future
-
-    async def decode(self, token_ids, **kwargs):
-        result_future: asyncio.Future = self._loop.create_future()
-        key = self._queue_key("decode", kwargs)
-        queue = self._get_queue(self._loop, key)
-        await queue.put((token_ids, result_future))
-        return await result_future
-
-    # === Internal helpers ===
-    def _get_queue(
-        self, loop: asyncio.AbstractEventLoop, key: tuple
-    ) -> asyncio.Queue[
-        tuple[str, dict, asyncio.Future] | tuple[list[int], asyncio.Future]
-    ]:
-        """Get the request queue for the given operation key, creating a new
-        queue and batcher task if needed."""
-        queue = self._queues.get(key)
-        if queue is None:
-            self._queues[key] = queue = asyncio.Queue()
-            if key[0] == "encode":
-                can_batch = key[1] != "other"
-                coro = self._batch_encode_loop(queue, can_batch)
-            else:
-                assert key[0] == "decode", f"Unknown operation type: {key[0]}."
-                coro = self._batch_decode_loop(queue)
-            self._batcher_tasks.append(loop.create_task(coro))
-        return queue
-
-    async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool):
-        """Batch incoming encode requests for efficiency."""
-        while True:
-            prompt, kwargs, result_future = await queue.get()
-            prompts = [prompt]
-            kwargs_list = [kwargs]
-            result_futures = [result_future]
-            deadline = self._loop.time() + self.batch_wait_timeout_s
-
-            while len(prompts) < self.max_batch_size:
-                timeout = deadline - self._loop.time()
-                if timeout <= 0:
-                    break
-                try:
-                    prompt, kwargs, result_future = await asyncio.wait_for(
-                        queue.get(), timeout
-                    )
-                    prompts.append(prompt)
-                    result_futures.append(result_future)
-                    if not can_batch:
-                        kwargs_list.append(kwargs)
-                except asyncio.TimeoutError:
-                    break
-
-            try:
-                # If every request uses identical kwargs we can run a single
-                # batched tokenizer call for a big speed-up.
-                if can_batch and len(prompts) > 1:
-                    batch_encode_fn = partial(self.tokenizer, prompts, **kwargs)
-                    results = await self._loop.run_in_executor(
-                        self._executor, batch_encode_fn
-                    )
-
-                    for i, fut in enumerate(result_futures):
-                        if not fut.done():
-                            data = {k: v[i] for k, v in results.items()}
-                            fut.set_result(BatchEncoding(data))
-                else:
-                    encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [
-                        self.tokenizer(p, **kw) for p, kw in zip(prompts, kwargs)
-                    ]
-                    results = await self._loop.run_in_executor(
-                        self._executor, encode_fn
-                    )
-
-                    for fut, res in zip(result_futures, results):
-                        if not fut.done():
-                            fut.set_result(res)
-            except Exception as e:
-                for fut in result_futures:
-                    if not fut.done():
-                        fut.set_exception(e)
-
-    async def _batch_decode_loop(self, queue: asyncio.Queue):
-        """Batch incoming decode requests for efficiency."""
-        while True:
-            token_ids, result_future = await queue.get()
-            token_ids_list = [token_ids]
-            result_futures = [result_future]
-            deadline = self._loop.time() + self.batch_wait_timeout_s
-
-            while len(token_ids_list) < self.max_batch_size:
-                timeout = deadline - self._loop.time()
-                if timeout <= 0:
-                    break
-                try:
-                    token_ids, result_future = await asyncio.wait_for(
-                        queue.get(), timeout
-                    )
-                    token_ids_list.append(token_ids)
-                    result_futures.append(result_future)
-                except asyncio.TimeoutError:
-                    break
-
-            try:
-                # Perform a single batched decode call for all requests
-                results = await self._loop.run_in_executor(
-                    self._executor, self.tokenizer.batch_decode, token_ids_list
-                )
-                for fut, res in zip(result_futures, results):
-                    if not fut.done():
-                        fut.set_result(res)
-            except Exception as e:
-                for fut in result_futures:
-                    if not fut.done():
-                        fut.set_exception(e)
-
-    def _queue_key(self, op: str, kwargs: dict) -> tuple:
-        """
-        Return a normalized key describing operation + kwargs.
-
-        - `add_special_tokens`: {True/False}
-        - `truncation`: {True/False}
-          - If `truncation` is False (`max_length` is None),
-            returns a key for a can_batch queue.
-          - If `truncation` is True and `max_length` is None or equals
-            `tokenizer.model_max_length`, returns a key for a can_batch queue.
-          - Otherwise, returns a key for a cannot_batch queue.
-
-        Examples:
-          - Decode: ("decode",)
-          - Encode typical:
-            ("encode", add_special_tokens, bool_truncation, max_length_label)
-          - Fallback: ("encode", "other")
-        """
-
-        if op == "decode":
-            return ("decode",)
-
-        add_special_tokens = kwargs.get("add_special_tokens", True)
-        truncation = kwargs.get("truncation", False)
-        max_length = kwargs.get("max_length")
-
-        if not truncation:
-            return "encode", add_special_tokens, False, None
-
-        model_max = getattr(self.tokenizer, "model_max_length", None)
-        if max_length is None or (model_max is not None and max_length == model_max):
-            return "encode", add_special_tokens, True, "model_max"
-
-        return "encode", "other"
-
-    def __del__(self):
-        if (
-            (tasks := getattr(self, "_batcher_tasks", None))
-            and (loop := getattr(self, "_loop", None))
-            and not loop.is_closed()
-        ):
-
-            def cancel_tasks():
-                for task in tasks:
-                    task.cancel()
-
-            loop.call_soon_threadsafe(cancel_tasks)
-
-
-def cancel_task_threadsafe(task: Task):
-    if task and not task.done():
-        run_in_loop(task.get_loop(), task.cancel)
-
-
 def close_sockets(sockets: Sequence[zmq.Socket | zmq.asyncio.Socket]):
     for sock in sockets:
         if sock is not None:
             sock.close(linger=0)
 
 
-def run_in_loop(loop: AbstractEventLoop, function: Callable, *args):
-    if in_loop(loop):
-        function(*args)
-    elif not loop.is_closed():
-        loop.call_soon_threadsafe(function, *args)
-
-
-def in_loop(event_loop: AbstractEventLoop) -> bool:
-    try:
-        return asyncio.get_running_loop() == event_loop
-    except RuntimeError:
-        return False
-
-
-async def merge_async_iterators(
-    *iterators: AsyncGenerator[T, None],
-) -> AsyncGenerator[tuple[int, T], None]:
-    """Merge multiple asynchronous iterators into a single iterator.
-
-    This method handle the case where some iterators finish before others.
-    When it yields, it yields a tuple (i, item) where i is the index of the
-    iterator that yields the item.
-    """
-    if len(iterators) == 1:
-        # Fast-path single iterator case.
-        async for item in iterators[0]:
-            yield 0, item
-        return
-
-    loop = asyncio.get_running_loop()
-
-    awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)}
-    try:
-        while awaits:
-            done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED)
-            for d in done:
-                pair = awaits.pop(d)
-                try:
-                    item = await d
-                    i, it = pair
-                    awaits[loop.create_task(anext(it))] = pair
-                    yield i, item
-                except StopAsyncIteration:
-                    pass
-    finally:
-        # Cancel any remaining iterators
-        for f, (_, it) in awaits.items():
-            with contextlib.suppress(BaseException):
-                f.cancel()
-                await it.aclose()
-
-
-async def collect_from_async_generator(iterator: AsyncGenerator[T, None]) -> list[T]:
-    """Collect all items from an async generator into a list."""
-    items = []
-    async for item in iterator:
-        items.append(item)
-    return items
-
-
 def get_ip() -> str:
     host_ip = envs.VLLM_HOST_IP
     if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ:
@@ -1803,12 +1532,6 @@ class FlexibleArgumentParser(ArgumentParser):
         return processed_args
 
 
-async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, **kwargs):
-    """Utility function to run async task in a lock"""
-    async with lock:
-        return await task(*args, **kwargs)
-
-
 # Using dynamo with vLLM doesn't really work well with PyTorch versions < 2.4.0.
 # In particular, the FakeScalarType is not supported for earlier versions of
 # PyTorch which breaks dynamo for any ops registered using ScalarType.
diff --git a/vllm/utils/async_utils.py b/vllm/utils/async_utils.py
new file mode 100644
index 0000000000000..aeabd808add50
--- /dev/null
+++ b/vllm/utils/async_utils.py
@@ -0,0 +1,299 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Contains helpers related to asynchronous code."""
+
+import asyncio
+import contextlib
+from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task
+from collections.abc import AsyncGenerator, Awaitable, Callable
+from concurrent.futures import Executor, ThreadPoolExecutor
+from functools import partial
+from typing import TypeVar
+
+from transformers.tokenization_utils_base import BatchEncoding
+from typing_extensions import ParamSpec
+
+P = ParamSpec("P")
+T = TypeVar("T")
+
+
+class AsyncMicrobatchTokenizer:
+    """Asynchronous tokenizer with micro-batching.
+
+    Pulls pending encode/decode requests from a queue and batches them
+    up to reduce overhead. A single-thread ThreadPoolExecutor is used
+    so the event loop stays responsive.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        max_batch_size: int = 32,
+        batch_wait_timeout_s: float = 0.002,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.max_batch_size = max_batch_size
+        self.batch_wait_timeout_s = batch_wait_timeout_s
+
+        self._loop = asyncio.get_running_loop()
+        self._queues: dict[
+            tuple,
+            asyncio.Queue[tuple[str, dict, Future] | tuple[list[int], Future]],
+        ] = {}
+        self._batcher_tasks: list[Task] = []
+
+        # Single-thread executor for blocking tokenizer calls.
+        self._executor = ThreadPoolExecutor(max_workers=1)
+
+    # === Public async API ===
+    async def __call__(self, prompt, **kwargs):
+        result_future: Future = self._loop.create_future()
+        key = self._queue_key("encode", kwargs)
+        queue = self._get_queue(self._loop, key)
+        await queue.put((prompt, kwargs, result_future))
+        return await result_future
+
+    async def decode(self, token_ids, **kwargs):
+        result_future: Future = self._loop.create_future()
+        key = self._queue_key("decode", kwargs)
+        queue = self._get_queue(self._loop, key)
+        await queue.put((token_ids, result_future))
+        return await result_future
+
+    # === Internal helpers ===
+    def _get_queue(
+        self, loop: asyncio.AbstractEventLoop, key: tuple
+    ) -> asyncio.Queue[tuple[str, dict, Future] | tuple[list[int], Future]]:
+        """Get the request queue for the given operation key, creating a new
+        queue and batcher task if needed."""
+        queue = self._queues.get(key)
+        if queue is None:
+            self._queues[key] = queue = asyncio.Queue()
+            if key[0] == "encode":
+                can_batch = key[1] != "other"
+                coro = self._batch_encode_loop(queue, can_batch)
+            else:
+                assert key[0] == "decode", f"Unknown operation type: {key[0]}."
+                coro = self._batch_decode_loop(queue)
+            self._batcher_tasks.append(loop.create_task(coro))
+        return queue
+
+    async def _batch_encode_loop(self, queue: asyncio.Queue, can_batch: bool):
+        """Batch incoming encode requests for efficiency."""
+        while True:
+            prompt, kwargs, result_future = await queue.get()
+            prompts = [prompt]
+            kwargs_list = [kwargs]
+            result_futures = [result_future]
+            deadline = self._loop.time() + self.batch_wait_timeout_s
+
+            while len(prompts) < self.max_batch_size:
+                timeout = deadline - self._loop.time()
+                if timeout <= 0:
+                    break
+                try:
+                    prompt, kwargs, result_future = await asyncio.wait_for(
+                        queue.get(), timeout
+                    )
+                    prompts.append(prompt)
+                    result_futures.append(result_future)
+                    if not can_batch:
+                        kwargs_list.append(kwargs)
+                except asyncio.TimeoutError:
+                    break
+
+            try:
+                # If every request uses identical kwargs we can run a single
+                # batched tokenizer call for a big speed-up.
+                if can_batch and len(prompts) > 1:
+                    batch_encode_fn = partial(self.tokenizer, prompts, **kwargs)
+                    results = await self._loop.run_in_executor(
+                        self._executor, batch_encode_fn
+                    )
+
+                    for i, fut in enumerate(result_futures):
+                        if not fut.done():
+                            data = {k: v[i] for k, v in results.items()}
+                            fut.set_result(BatchEncoding(data))
+                else:
+                    encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [
+                        self.tokenizer(p, **kw) for p, kw in zip(prompts, kwargs)
+                    ]
+                    results = await self._loop.run_in_executor(
+                        self._executor, encode_fn
+                    )
+
+                    for fut, res in zip(result_futures, results):
+                        if not fut.done():
+                            fut.set_result(res)
+            except Exception as e:
+                for fut in result_futures:
+                    if not fut.done():
+                        fut.set_exception(e)
+
+    async def _batch_decode_loop(self, queue: asyncio.Queue):
+        """Batch incoming decode requests for efficiency."""
+        while True:
+            token_ids, result_future = await queue.get()
+            token_ids_list = [token_ids]
+            result_futures = [result_future]
+            deadline = self._loop.time() + self.batch_wait_timeout_s
+
+            while len(token_ids_list) < self.max_batch_size:
+                timeout = deadline - self._loop.time()
+                if timeout <= 0:
+                    break
+                try:
+                    token_ids, result_future = await asyncio.wait_for(
+                        queue.get(), timeout
+                    )
+                    token_ids_list.append(token_ids)
+                    result_futures.append(result_future)
+                except asyncio.TimeoutError:
+                    break
+
+            try:
+                # Perform a single batched decode call for all requests
+                results = await self._loop.run_in_executor(
+                    self._executor, self.tokenizer.batch_decode, token_ids_list
+                )
+                for fut, res in zip(result_futures, results):
+                    if not fut.done():
+                        fut.set_result(res)
+            except Exception as e:
+                for fut in result_futures:
+                    if not fut.done():
+                        fut.set_exception(e)
+
+    def _queue_key(self, op: str, kwargs: dict) -> tuple:
+        """
+        Return a normalized key describing operation + kwargs.
+
+        - `add_special_tokens`: {True/False}
+        - `truncation`: {True/False}
+          - If `truncation` is False (`max_length` is None),
+            returns a key for a can_batch queue.
+          - If `truncation` is True and `max_length` is None or equals
+            `tokenizer.model_max_length`, returns a key for a can_batch queue.
+          - Otherwise, returns a key for a cannot_batch queue.
+
+        Examples:
+          - Decode: ("decode",)
+          - Encode typical:
+            ("encode", add_special_tokens, bool_truncation, max_length_label)
+          - Fallback: ("encode", "other")
+        """
+
+        if op == "decode":
+            return ("decode",)
+
+        add_special_tokens = kwargs.get("add_special_tokens", True)
+        truncation = kwargs.get("truncation", False)
+        max_length = kwargs.get("max_length")
+
+        if not truncation:
+            return "encode", add_special_tokens, False, None
+
+        model_max = getattr(self.tokenizer, "model_max_length", None)
+        if max_length is None or (model_max is not None and max_length == model_max):
+            return "encode", add_special_tokens, True, "model_max"
+
+        return "encode", "other"
+
+    def __del__(self):
+        if (
+            (tasks := getattr(self, "_batcher_tasks", None))
+            and (loop := getattr(self, "_loop", None))
+            and not loop.is_closed()
+        ):
+
+            def cancel_tasks():
+                for task in tasks:
+                    task.cancel()
+
+            loop.call_soon_threadsafe(cancel_tasks)
+
+
+def cancel_task_threadsafe(task: Task):
+    if task and not task.done():
+        run_in_loop(task.get_loop(), task.cancel)
+
+
+def make_async(
+    func: Callable[P, T],
+    executor: Executor | None = None,
+) -> Callable[P, Awaitable[T]]:
+    """
+    Take a blocking function, and run it in an executor thread.
+
+    This function prevents the blocking function from blocking the
+    asyncio event loop.
+    The code in this function needs to be thread safe.
+    """
+
+    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Future[T]:
+        loop = asyncio.get_event_loop()
+        p_func = partial(func, *args, **kwargs)
+        return loop.run_in_executor(executor=executor, func=p_func)
+
+    return _async_wrapper
+
+
+def run_in_loop(loop: AbstractEventLoop, function: Callable, *args):
+    if in_loop(loop):
+        function(*args)
+    elif not loop.is_closed():
+        loop.call_soon_threadsafe(function, *args)
+
+
+def in_loop(event_loop: AbstractEventLoop) -> bool:
+    try:
+        return asyncio.get_running_loop() == event_loop
+    except RuntimeError:
+        return False
+
+
+async def merge_async_iterators(
+    *iterators: AsyncGenerator[T, None],
+) -> AsyncGenerator[tuple[int, T], None]:
+    """Merge multiple asynchronous iterators into a single iterator.
+
+    This function handles the case where some iterators finish before others.
+    When it yields, it yields a tuple (i, item) where i is the index of the
+    iterator that yields the item.
+    """
+    if len(iterators) == 1:
+        # Fast-path single iterator case.
+        async for item in iterators[0]:
+            yield 0, item
+        return
+
+    loop = asyncio.get_running_loop()
+
+    awaits = {loop.create_task(anext(it)): (i, it) for i, it in enumerate(iterators)}
+    try:
+        while awaits:
+            done, _ = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED)
+            for d in done:
+                pair = awaits.pop(d)
+                try:
+                    item = await d
+                    i, it = pair
+                    awaits[loop.create_task(anext(it))] = pair
+                    yield i, item
+                except StopAsyncIteration:
+                    pass
+    finally:
+        # Cancel any remaining iterators
+        for f, (_, it) in awaits.items():
+            with contextlib.suppress(BaseException):
+                f.cancel()
+                await it.aclose()
+
+
+async def collect_from_async_generator(iterator: AsyncGenerator[T, None]) -> list[T]:
+    """Collect all items from an async generator into a list."""
+    items = []
+    async for item in iterator:
+        items.append(item)
+    return items
diff --git a/vllm/utils/func.py b/vllm/utils/func.py
index bd26b29d5f6dc..c061a0dad5525 100644
--- a/vllm/utils/func.py
+++ b/vllm/utils/func.py
@@ -6,12 +6,10 @@ Contains helpers that are applied to functions.
 This is similar in concept to the `functools` module.
 """
 
-import asyncio
-import concurrent.futures
 import inspect
 import threading
 import warnings
-from collections.abc import Awaitable, Callable, Mapping
+from collections.abc import Callable, Mapping
 from functools import lru_cache, partial, wraps
 from typing import Any, TypeVar
 
@@ -32,26 +30,6 @@ def identity(value: T, **kwargs) -> T:
     return value
 
 
-def make_async(
-    func: Callable[P, T],
-    executor: concurrent.futures.Executor | None = None,
-) -> Callable[P, Awaitable[T]]:
-    """
-    Take a blocking function, and run it on in an executor thread.
-
-    This function prevents the blocking function from blocking the
-    asyncio event loop.
-    The code in this function needs to be thread safe.
-    """
-
-    def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future[T]:
-        loop = asyncio.get_event_loop()
-        p_func = partial(func, *args, **kwargs)
-        return loop.run_in_executor(executor=executor, func=p_func)
-
-    return _async_wrapper
-
-
 def run_once(f: Callable[P, None]) -> Callable[P, None]:
     def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
         if wrapper.has_run:  # type: ignore[attr-defined]
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index c8fb30f96c0a0..ed9d82ca5373e 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -29,7 +29,8 @@ from vllm.tracing import init_tracer
 from vllm.transformers_utils.config import maybe_register_config_serialize_by_value
 from vllm.transformers_utils.tokenizer import AnyTokenizer, init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, as_list, cancel_task_threadsafe, cdiv
+from vllm.utils import Device, as_list, cdiv
+from vllm.utils.async_utils import cancel_task_threadsafe
 from vllm.utils.func import deprecate_kwargs
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core_client import EngineCoreClient
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index c800d0d279af1..a9deebc7e1f5c 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -27,9 +27,9 @@ from vllm.utils import (
     close_sockets,
     get_open_port,
     get_open_zmq_inproc_path,
-    in_loop,
     make_zmq_socket,
 )
+from vllm.utils.async_utils import in_loop
 from vllm.v1.engine import (
     EngineCoreOutputs,
     EngineCoreRequest,

From d3cbaa08dc94a75edc129df42966ccbeb4b15f5d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 15 Oct 2025 17:01:09 +0100
Subject: [PATCH 41/51] Lower severity of log when model info cache misses due
 to exception (#26917)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c43964285c052..d119c161f6b36 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -581,7 +581,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
             # file not changed, use cached _ModelInfo properties
             return _ModelInfo(**mi_dict["modelinfo"])
         except Exception:
-            logger.exception(
+            logger.debug(
                 ("Cached model info for class %s.%s error. "),
                 self.module_name,
                 self.class_name,

From 4794c2bd92d9066babef217cbb0a1b9d3d85f483 Mon Sep 17 00:00:00 2001
From: Pradeep Dasigi <pradeep.dasigi@gmail.com>
Date: Wed, 15 Oct 2025 09:36:12 -0700
Subject: [PATCH 42/51] Olmo 3 tool parser and tests (#26143)

Signed-off-by: Pradeep Dasigi <pradeepd@allenai.org>
---
 docs/features/tool_calling.md                 |  10 +
 .../tool_parsers/test_olmo3_tool_parser.py    | 243 ++++++++++++
 .../openai/tool_parsers/__init__.py           |   2 +
 .../openai/tool_parsers/olmo3_tool_parser.py  | 368 ++++++++++++++++++
 4 files changed, 623 insertions(+)
 create mode 100644 tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
 create mode 100644 vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py

diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index 02a700c09d391..5829bfa44e428 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -352,6 +352,16 @@ Supported models:
 
 Flags: `--tool-call-parser qwen3_xml`
 
+### Olmo 3 Models (`olmo3`)
+
+Olmo 3 models output tool calls in a format that is very similar to the one expected by the `pythonic` parser (see below), with a few differences. Each tool call is a pythonic string, but parallel tool calls are newline-delimited, and the calls are wrapped within XML tags as `<function_calls>..</function_calls>`. The parser also accepts JSON boolean and null literals (`true`, `false`, and `null`) in addition to the pythonic ones (`True`, `False`, and `None`).
+
+Supported models:
+
+* TODO (will be updated after Olmo 3 release)
+
+Flags: `--tool-call-parser olmo3`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
diff --git a/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
new file mode 100644
index 0000000000000..224196b9a0b2e
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_olmo3_tool_parser.py
@@ -0,0 +1,243 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.utils import (
+    run_tool_extraction,
+    run_tool_extraction_streaming,
+)
+from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+
+# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
+SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
+SIMPLE_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "San Francisco", "metric": "celsius"}',
+)
+MORE_TYPES_FUNCTION_OUTPUT = (
+    "register_user(name='John Doe', "
+    "age=37, "
+    "address={'city': 'San Francisco', 'state': 'CA'}, "
+    "role=None, "
+    "passed_test=True, "
+    "aliases=['John', 'Johnny'])"
+)
+MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS = (
+    "register_user(name='John Doe', "
+    "age=37, "
+    "address={'city': 'San Francisco', 'state': 'CA'}, "
+    "role=null, "
+    "passed_test=true, "
+    "aliases=['John', 'Johnny'])"
+)
+MORE_TYPES_FUNCTION_CALL = FunctionCall(
+    name="register_user",
+    arguments='{"name": "John Doe", '
+    '"age": 37, '
+    '"address": {"city": "San Francisco", "state": "CA"}, '
+    '"role": null, '
+    '"passed_test": true, '
+    '"aliases": ["John", "Johnny"]}',
+)
+PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
+PARAMETERLESS_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments="{}",
+)
+EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
+EMPTY_DICT_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"additional_data": {}}',
+)
+EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
+EMPTY_LIST_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"steps": []}',
+)
+ESCAPED_STRING_FUNCTION_OUTPUT = (
+    r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')"
+)
+ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
+)
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_no_tool_call(streaming: bool):
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
+    model_output = "How can I help you today?"
+
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
+
+    assert content == model_output
+    assert len(tool_calls) == 0
+
+
+TEST_CASES = [
+    pytest.param(
+        True,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL],
+        id="simple_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL],
+        id="simple_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_streaming_json_literals",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{MORE_TYPES_FUNCTION_OUTPUT_JSON_LITERALS}</function_calls>",
+        [MORE_TYPES_FUNCTION_CALL],
+        id="more_types_nonstreaming_json_literals",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{PARAMETERLESS_FUNCTION_OUTPUT}</function_calls>",
+        [PARAMETERLESS_FUNCTION_CALL],
+        id="parameterless_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{EMPTY_DICT_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_DICT_FUNCTION_CALL],
+        id="empty_dict_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
+        [EMPTY_LIST_FUNCTION_CALL],
+        id="empty_list_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{ESCAPED_STRING_FUNCTION_OUTPUT}</function_calls>",
+        [ESCAPED_STRING_FUNCTION_CALL],
+        id="escaped_string_nonstreaming",
+    ),
+    pytest.param(
+        True,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
+        id="parallel_calls_streaming",
+    ),
+    pytest.param(
+        False,
+        f"<function_calls>{SIMPLE_FUNCTION_OUTPUT}\n{MORE_TYPES_FUNCTION_OUTPUT}</function_calls>",
+        [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
+        id="parallel_calls_nonstreaming",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", TEST_CASES)
+def test_tool_call(
+    streaming: bool, model_output: str, expected_tool_calls: list[FunctionCall]
+):
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
+
+    content, tool_calls = run_tool_extraction(
+        tool_parser, model_output, streaming=streaming
+    )
+
+    assert content is None
+    assert len(tool_calls) == len(expected_tool_calls)
+    for actual, expected in zip(tool_calls, expected_tool_calls):
+        assert actual.type == "function"
+        assert actual.function == expected
+
+
+def test_streaming_tool_call_with_large_steps():
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
+    model_output_deltas = [
+        "<function_calls>get_weather(city='San",
+        " Francisco', metric='celsius')\n"
+        f"{PARAMETERLESS_FUNCTION_OUTPUT}\n"
+        f"{EMPTY_LIST_FUNCTION_OUTPUT}</function_calls>",
+    ]
+
+    reconstructor = run_tool_extraction_streaming(
+        tool_parser, model_output_deltas, assert_one_tool_per_delta=False
+    )
+
+    assert reconstructor.other_content == ""
+    assert len(reconstructor.tool_calls) == 3
+    assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
+    assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
+    assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
+
+
+@pytest.mark.parametrize("streaming", [False])
+def test_regex_timeout_handling(streaming: bool):
+    """test regex timeout is handled gracefully"""
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser("olmo3")(mock_tokenizer)
+
+    fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2
+
+    # create a mock regex that raises TimeoutError
+    mock_regex = MagicMock()
+    mock_regex.match.side_effect = TimeoutError("Regex timeout")
+
+    with patch.object(tool_parser, "TOOL_CALL_REGEX", mock_regex):
+        content, tool_calls = run_tool_extraction(
+            tool_parser, fake_problematic_input, streaming=streaming
+        )
+
+        # should treat as regular text when regex times out
+        assert content == fake_problematic_input
+        assert len(tool_calls) == 0
+        mock_regex.match.assert_called_once()
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index 859da8392fc07..a72772f59cf2f 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -18,6 +18,7 @@ from .llama_tool_parser import Llama3JsonToolParser
 from .longcat_tool_parser import LongcatFlashToolParser
 from .minimax_tool_parser import MinimaxToolParser
 from .mistral_tool_parser import MistralToolParser
+from .olmo3_tool_parser import Olmo3PythonicToolParser
 from .openai_tool_parser import OpenAIToolParser
 from .phi4mini_tool_parser import Phi4MiniJsonToolParser
 from .pythonic_tool_parser import PythonicToolParser
@@ -45,6 +46,7 @@ __all__ = [
     "DeepSeekV31ToolParser",
     "Ernie45ToolParser",
     "xLAMToolParser",
+    "Olmo3PythonicToolParser",
     "MinimaxToolParser",
     "KimiK2ToolParser",
     "HunyuanA13BToolParser",
diff --git a/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
new file mode 100644
index 0000000000000..ed5633aac02d4
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py
@@ -0,0 +1,368 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any
+
+import regex as re
+from transformers import PreTrainedTokenizerBase
+
+import vllm.envs as envs
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser,
+    ToolParserManager,
+)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class _UnexpectedAstError(Exception):
+    pass
+
+
+@ToolParserManager.register_module("olmo3")
+class Olmo3PythonicToolParser(ToolParser):
+    """
+    Tool call parser for Olmo 3 models that produce tool calls as
+    newline-separated pythonic strings.
+    Used when --enable-auto-tool-choice --tool-call-parser olmo3 are both set.
+    Code copied from pythonic_tool_parser.py and updated to handle
+    - newline separated pythonic tool calls.
+    - argument values being null/true/false instead of Pythonic literals.
+    """
+
+    # TODO(mdepinet): Possible future improvements:
+    #   1. Support text + tools separated by either <|python_tag|> or \n\n
+    #   2. Support tools outside of a list (or separated by a semicolon).
+    #      This depends on item 1 for consistent streaming.
+    # Neither of these are necessary for e.g. ToolACE, but both would help make
+    # Llama3.2 models more reliable.
+
+    TOOL_CALL_REGEX = re.compile(
+        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+        re.DOTALL,
+    )
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+    # Rename for readability. This is NOT a tool id.
+    @property
+    def current_tool_index(self) -> int:
+        return self.current_tool_id
+
+    @current_tool_index.setter
+    def current_tool_index(self, value: int) -> None:
+        self.current_tool_id = value
+
+    def extract_tool_calls(
+        self, model_output: str, request: ChatCompletionRequest
+    ) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response.
+        """
+        original_model_output = model_output
+        # Remove xml tags.
+        match = re.search(
+            r"<function_calls>(.*?)</function_calls>", model_output, re.DOTALL
+        )
+        if match:
+            model_output = match.group(1).strip()
+        # Make the newline separated function calls into a list.
+        model_output = ", ".join(
+            [line.strip() for line in model_output.splitlines() if line.strip()]
+        )
+        model_output = f"[{model_output}]"
+
+        is_tool_call_pattern = False
+        try:
+            is_tool_call_pattern = (
+                self.TOOL_CALL_REGEX.match(
+                    model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
+                )
+                is not None
+            )
+        except TimeoutError:
+            logger.warning("Regex timeout occurred when matching tool call pattern.")
+            logger.debug(
+                "Regex timeout occurred when matching user input: %s", model_output
+            )
+
+        if not is_tool_call_pattern:
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=original_model_output
+            )
+
+        try:
+            module = ast.parse(model_output)
+            parsed = getattr(module.body[0], "value", None)
+            if isinstance(parsed, ast.List) and all(
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=[
+                        _handle_single_tool(e)  # type: ignore
+                        for e in parsed.elts
+                    ],
+                    content=None,
+                )
+            else:
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls"
+                )
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # Treat as regular text
+            return ExtractedToolCallInformation(
+                tools_called=False, tool_calls=[], content=original_model_output
+            )
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> DeltaMessage | None:
+        # All function calls start with the <function_calls> tag.
+        # But since this is streaming, we may have seen only part of the tag.
+        if not current_text.startswith("<"):
+            return DeltaMessage(content=delta_text)
+
+        try:
+            # Remove xml tags.
+            if current_text.startswith("<function_calls>"):
+                current_text = current_text[len("<function_calls>") :]
+            if current_text.endswith("</function_calls>"):
+                current_text = current_text[: -len("</function_calls>")]
+
+            valid_and_added_text = _make_valid_python(current_text)
+            if valid_and_added_text is None:
+                return None
+            valid_text, added_text = valid_and_added_text
+
+            # Make the newline separated function calls into a list.
+            valid_text = ", ".join(
+                [line.strip() for line in valid_text.splitlines() if line.strip()]
+            )
+            valid_text = f"[{valid_text}]"
+            module = ast.parse(valid_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not isinstance(parsed, ast.List) or not all(
+                isinstance(e, ast.Call) for e in parsed.elts
+            ):
+                raise _UnexpectedAstError(
+                    "Tool output must be a sequence of newline-separated calls"
+                )
+            tool_calls = [
+                _handle_single_tool(e)  # type: ignore
+                for e in parsed.elts
+            ]
+
+            tool_deltas = []
+            for index, new_call in enumerate(tool_calls):
+                if index < self.current_tool_index:
+                    continue
+
+                self.current_tool_index = index
+                if len(self.streamed_args_for_tool) == index:
+                    self.streamed_args_for_tool.append("")
+
+                new_call_complete = index < len(tool_calls) - 1 or ")" not in added_text
+                if new_call_complete:
+                    self.current_tool_index += 1
+
+                withheld_suffix = added_text[:-1] if not new_call_complete else ""
+                if not new_call_complete and added_text[-1] == ")":
+                    # Function call is incomplete. Withhold the closing bracket.
+                    withheld_suffix = withheld_suffix + "}"
+                # Strings get single quotes in the model-produced string.
+                # JSON requires double quotes.
+                withheld_suffix = withheld_suffix.replace("'", '"')
+                delta = _compute_tool_delta(
+                    self.streamed_args_for_tool[index], new_call, index, withheld_suffix
+                )
+
+                if delta is not None:
+                    tool_deltas.append(delta)
+                    if (
+                        delta.function is not None
+                        and delta.function.arguments is not None
+                    ):
+                        self.streamed_args_for_tool[index] += delta.function.arguments
+
+            # HACK: serving_chat.py inspects the internal state of tool parsers
+            # when determining its final streaming delta, automatically
+            # adding autocompleted JSON.
+            # These two lines avoid that nonsense while ensuring finish_reason
+            # is set to tool_calls when at least one tool is called.
+            if tool_deltas and not self.prev_tool_call_arr:
+                self.prev_tool_call_arr = [{"arguments": {}}]
+
+            if tool_deltas:
+                return DeltaMessage(tool_calls=tool_deltas)
+            elif not added_text and self.current_tool_id > 0:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage(content="")
+            else:
+                return None
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction error"
+            )
+            return None
+
+
+def _get_parameter_value(val: ast.expr) -> Any:
+    if isinstance(val, ast.Constant):
+        return val.value
+    elif isinstance(val, ast.Dict):
+        if not all(isinstance(k, ast.Constant) for k in val.keys):
+            raise _UnexpectedAstError("Dict tool call arguments must have literal keys")
+        return {
+            k.value: _get_parameter_value(v)  # type: ignore
+            for k, v in zip(val.keys, val.values)
+        }
+    elif isinstance(val, ast.List):
+        return [_get_parameter_value(v) for v in val.elts]
+    # The model may return function calls where the values are null/true/false
+    # because the system prompt has API description in json.
+    elif isinstance(val, ast.Name) and val.id in ["null", "true", "false"]:
+        if val.id == "null":
+            return None
+        elif val.id == "true":
+            return True
+        elif val.id == "false":
+            return False
+    else:
+        raise _UnexpectedAstError("Tool call arguments must be literals")
+
+
+def _handle_single_tool(call: ast.Call) -> ToolCall:
+    if not isinstance(call.func, ast.Name):
+        raise _UnexpectedAstError("Invalid tool call name")
+    function_name = call.func.id
+    arguments = {}
+    for keyword in call.keywords:
+        arguments[keyword.arg] = _get_parameter_value(keyword.value)
+    return ToolCall(
+        type="function",
+        function=FunctionCall(
+            name=function_name, arguments=json.dumps(arguments, ensure_ascii=False)
+        ),
+    )
+
+
+def _make_valid_python(text: str) -> tuple[str, str] | None:
+    bracket_stack = []
+    for index, char in enumerate(text):
+        if char in {"[", "(", "{"}:
+            bracket_stack.append(char)
+        elif char == "]":
+            if not bracket_stack or bracket_stack.pop() != "[":
+                raise _UnexpectedAstError("Mismatched square brackets")
+        elif char == ")":
+            if not bracket_stack or bracket_stack.pop() != "(":
+                raise _UnexpectedAstError("Mismatched parentheses")
+        elif char == "}":
+            if not bracket_stack or bracket_stack.pop() != "{":
+                raise _UnexpectedAstError("Mismatched curly braces")
+        elif char in {"'", '"'}:
+            if bracket_stack and bracket_stack[-1] == char:
+                if index > 0 and text[index - 1] == "\\":
+                    # Treat an escaped quote as a regular character
+                    pass
+                else:
+                    bracket_stack.pop()
+            elif bracket_stack and bracket_stack[-1] in {"'", '"'}:
+                # Double quote within a single quote string or vice versa.
+                pass
+            else:
+                bracket_stack.append(char)
+
+    text = text.rstrip()
+    if text.endswith("=") or text.endswith(":"):
+        # Since we have no type information for this property/parameter value,
+        # we can't fill in a valid value.
+        return None
+    if bracket_stack and bracket_stack[-1] == "{":
+        trailing_dict_text = text[: text.rfind("{")]
+        num_keys = trailing_dict_text.count(":")
+        num_values = trailing_dict_text.count(",")
+        if num_keys <= num_values:
+            return None  # Incomplete property name within parameter value
+    if bracket_stack and bracket_stack[-1] == "(":
+        trailing_params_text = text[: text.rfind("(")]
+        num_full_param_names = trailing_params_text.count("=")
+        num_full_param_values = trailing_params_text.count(",")
+        if num_full_param_names <= num_full_param_values:
+            return None  # Incomplete parameter name
+    if text.endswith(","):
+        text = text[:-1]
+    if (
+        bracket_stack
+        and bracket_stack[-1] == "["
+        and not text.endswith("[")
+        and not text.endswith(")")
+    ):
+        return None  # Incomplete function name
+
+    added_text = ""
+    for char in reversed(bracket_stack):
+        if char == "[":
+            added_text += "]"
+        elif char == "(":
+            added_text += ")"
+        elif char == "{":
+            added_text += "}"
+        elif char == "'":
+            added_text += "'"
+        elif char == '"':
+            added_text += '"'
+
+    return text + added_text, added_text
+
+
+def _compute_tool_delta(
+    previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
+) -> DeltaToolCall | None:
+    new_call_args = new_call.function.arguments
+    if withheld_suffix:
+        assert new_call_args.endswith(withheld_suffix)
+        new_call_args = new_call_args[: -len(withheld_suffix)]
+    if not previously_sent_args:
+        return DeltaToolCall(
+            id=new_call.id,
+            type="function",
+            index=index,
+            function=DeltaFunctionCall(
+                name=new_call.function.name,
+                arguments=new_call_args,
+            ),
+        )
+
+    arg_diff = new_call_args[len(previously_sent_args) :]
+    return (
+        DeltaToolCall(
+            id=None, index=index, function=DeltaFunctionCall(arguments=arg_diff)
+        )
+        if arg_diff
+        else None
+    )

From 14f845634481d5223f4573461c6e2a4fe57eda98 Mon Sep 17 00:00:00 2001
From: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Date: Thu, 16 Oct 2025 01:44:03 +0900
Subject: [PATCH 43/51] [Feature]: Use pydantic validation in observability.py
 config (#26637)

Signed-off-by: Samuel Wu <cernunnos1710@gmail.com>
Signed-off-by: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/observability.py | 56 ++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/vllm/config/observability.py b/vllm/config/observability.py
index 592246c1c35f8..564c4f7aed419 100644
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -5,6 +5,8 @@ import hashlib
 from functools import cached_property
 from typing import Any, Literal, cast
 
+from packaging.version import parse
+from pydantic import field_validator, model_validator
 from pydantic.dataclasses import dataclass
 
 from vllm import version
@@ -79,25 +81,43 @@ class ObservabilityConfig:
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
-    def __post_init__(self):
-        if (
-            self.collect_detailed_traces is not None
-            and len(self.collect_detailed_traces) == 1
-            and "," in self.collect_detailed_traces[0]
-        ):
-            self._parse_collect_detailed_traces()
+    @field_validator("show_hidden_metrics_for_version")
+    @classmethod
+    def _validate_show_hidden_metrics_for_version(cls, value: str | None) -> str | None:
+        if value is not None:
+            # Raises an exception if the string is not a valid version.
+            parse(value)
+        return value
 
-        from vllm.tracing import is_otel_available, otel_import_error_traceback
+    @field_validator("otlp_traces_endpoint")
+    @classmethod
+    def _validate_otlp_traces_endpoint(cls, value: str | None) -> str | None:
+        if value is not None:
+            from vllm.tracing import is_otel_available, otel_import_error_traceback
 
-        if not is_otel_available() and self.otlp_traces_endpoint is not None:
+            if not is_otel_available():
+                raise ValueError(
+                    "OpenTelemetry is not available. Unable to configure "
+                    "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
+                    f"installed. Original error:\n{otel_import_error_traceback}"
+                )
+        return value
+
+    @field_validator("collect_detailed_traces")
+    @classmethod
+    def _validate_collect_detailed_traces(
+        cls, value: list[DetailedTraceModules] | None
+    ) -> list[DetailedTraceModules] | None:
+        """Handle the legacy case where users might provide a comma-separated
+        string instead of a list of strings."""
+        if value is not None and len(value) == 1 and "," in value[0]:
+            value = cast(list[DetailedTraceModules], value[0].split(","))
+        return value
+
+    @model_validator(mode="after")
+    def _validate_tracing_config(self):
+        if self.collect_detailed_traces and not self.otlp_traces_endpoint:
             raise ValueError(
-                "OpenTelemetry is not available. Unable to configure "
-                "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
-                f"installed. Original error:\n{otel_import_error_traceback}"
+                "collect_detailed_traces requires `--otlp-traces-endpoint` to be set."
             )
-
-    def _parse_collect_detailed_traces(self):
-        assert isinstance(self.collect_detailed_traces, list)
-        self.collect_detailed_traces = cast(
-            list[DetailedTraceModules], self.collect_detailed_traces[0].split(",")
-        )
+        return self

From d7963752589f53b061da362b80663fab0aeee081 Mon Sep 17 00:00:00 2001
From: XiaobingZhang <xiaobingzhangupc@gmail.com>
Date: Thu, 16 Oct 2025 01:06:17 +0800
Subject: [PATCH 44/51] [ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)

Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
---
 vllm/model_executor/layers/quantization/modelopt.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 0f0638899bf1e..79bf8109b8fd2 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1542,23 +1542,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             del layer.w2_input_scale_quant
         else:
             # Non-TRT-LLM processing (Cutlass or non-flashinfer)
-            assert layer.w13_weight_scale.shape[2] % 16 == 0, (
-                "Expected weight_scale.dim(1) to be divisible by 16"
-            )
-            assert layer.w13_weight_scale.dtype == torch.float8_e4m3fn, (
-                "Weight Blockscale must be represented as FP8-E4M3"
-            )
             w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
             layer.w13_weight_scale = Parameter(
                 w13_blockscale_swizzled, requires_grad=False
             )
 
-            assert layer.w2_weight_scale.shape[2] % 16 == 0, (
-                "Expected weight_scale.dim(1) to be divisible by 16"
-            )
-            assert layer.w2_weight_scale.dtype == torch.float8_e4m3fn, (
-                "Weight Blockscale must be represented as FP8-E4M3"
-            )
             w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
             layer.w2_weight_scale = Parameter(
                 w2_blockscale_swizzled, requires_grad=False

From a1063628a48e124f28eb84b621a38928dc7df09d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 15 Oct 2025 10:52:54 -0700
Subject: [PATCH 45/51] [Chore] Clean up CODEOWNERS (#26923)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 .github/CODEOWNERS | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 61ac9fefc59f4..3fbc38d9a26c7 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,9 +5,7 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
@@ -26,7 +24,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
 /vllm/v1/attention/backends/flashinfer.py @mgoin
 /vllm/v1/attention/backends/triton_attn.py @tdoublep

From de92d916fe8a897b00a8adb0aab9ed9ec99f2b6c Mon Sep 17 00:00:00 2001
From: Kaixi Hou <kaixih@nvidia.com>
Date: Wed, 15 Oct 2025 10:53:00 -0700
Subject: [PATCH 46/51] [NVIDIA] Add support for cudnn fp4 gemm via flashinfer
 (#26107)

Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
---
 vllm/envs.py                                  | 17 +++++---
 .../schemes/compressed_tensors_w4a4_nvfp4.py  | 40 ++++++++++++-------
 .../layers/quantization/modelopt.py           | 38 ++++++++++--------
 3 files changed, 57 insertions(+), 38 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index b5c7f325f670d..cb3dab51eff4d 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -191,6 +191,7 @@ if TYPE_CHECKING:
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
     VLLM_USE_TRTLLM_ATTENTION: str | None = None
+    VLLM_NVFP4_GEMM_BACKEND: str | None = None
     VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
     VLLM_HAS_FLASHINFER_CUBIN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
@@ -1292,11 +1293,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
     "VLLM_HAS_FLASHINFER_CUBIN": lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
-    # If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
-    # Otherwise, uses the first available of: flashinfer cutlass GEMM,
-    # vllm cutlass GEMM, marlin GEMM.
-    "VLLM_USE_TRTLLM_FP4_GEMM": lambda: bool(
-        int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))
+    # Supported options:
+    # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
+    # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
+    # - "flashinfer-cutlass": use flashinfer cutlass GEMM backend
+    # - <none>: automatically pick an available backend
+    "VLLM_NVFP4_GEMM_BACKEND": env_with_choices(
+        "VLLM_NVFP4_GEMM_BACKEND",
+        None,
+        ["flashinfer-cudnn", "flashinfer-trtllm", "flashinfer-cutlass"],
     ),
     # Controls garbage collection during CUDA graph capture.
     # If set to 0 (default), enables GC freezing to speed up capture time.
@@ -1492,7 +1497,6 @@ def compute_hash() -> str:
         "VLLM_DISABLED_KERNELS",
         "VLLM_USE_DEEP_GEMM",
         "VLLM_USE_DEEP_GEMM_E8M0",
-        "VLLM_USE_TRTLLM_FP4_GEMM",
         "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
         "VLLM_USE_FLASHINFER_MOE_FP16",
         "VLLM_USE_FLASHINFER_MOE_FP8",
@@ -1524,6 +1528,7 @@ def compute_hash() -> str:
         "VLLM_ROCM_FP8_MFMA_PAGE_ATTN",
         "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE",
         "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING",
+        "VLLM_NVFP4_GEMM_BACKEND",
         "VLLM_USE_FBGEMM",
     ]
     for key in environment_variables_to_hash:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 192661c5b7ece..4127cd2d574bd 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -14,7 +14,10 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
 from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
     run_nvfp4_emulations,
 )
-from vllm.model_executor.layers.quantization.utils.quant_utils import swizzle_blockscale
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    cutlass_fp4_supported,
+    swizzle_blockscale,
+)
 from vllm.model_executor.parameter import (
     GroupQuantScaleParameter,
     ModelWeightParameter,
@@ -29,10 +32,12 @@ __all__ = ["CompressedTensorsW4A4Fp4"]
 
 class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
     def __init__(self):
-        if envs.VLLM_USE_TRTLLM_FP4_GEMM:
-            assert has_flashinfer(), "TRTLLM FP4 GEMM requires FlashInfer"
-            self.backend = "flashinfer-trtllm"
-            logger.info_once("Using flashinfer-trtllm for FP4")
+        self.backend = "none"
+        # An explicit VLLM_NVFP4_GEMM_BACKEND wins; otherwise fbgemm, then auto-pick.
+        if envs.VLLM_NVFP4_GEMM_BACKEND is not None:
+            if envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
+                self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
+                assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
         elif envs.VLLM_USE_FBGEMM:
             self.backend = "fbgemm"
             try:
@@ -42,12 +47,17 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
                     "Backend fbgemm requires fbgemm.f4f4bf16 operator, "
                     "Please install with: pip install fbgemm-gpu-genai"
                 ) from exc
-            logger.info_once("Using FGBEMM-GPU-GENAI for FP4")
         elif has_flashinfer():
             self.backend = "flashinfer-cutlass"
-            logger.info_once("Using flashinfer-cutlass for FP4")
-        else:
+        elif cutlass_fp4_supported():
             self.backend = "cutlass"
+
+        if self.backend == "none":
+            raise ValueError(
+                "No valid NVFP4 GEMM backend found. "
+                "Please check your platform capability."
+            )
+        logger.info_once(f"Using {self.backend} for NVFP4 GEMM")
         self.group_size = 16
 
     @classmethod
@@ -184,10 +194,9 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
             layer.alpha,
             output_dtype,
         )
-        if self.backend == "flashinfer-trtllm":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm")
-        elif self.backend == "flashinfer-cutlass":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="cutlass")
+        if self.backend.startswith("flashinfer-"):
+            backend_name = self.backend[len("flashinfer-") :]
+            out = flashinfer_scaled_fp4_mm(*mm_args, backend=backend_name)
         elif self.backend == "fbgemm":
             out = torch.ops.fbgemm.f4f4bf16(
                 x_fp4,
@@ -198,6 +207,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
                 use_mx=False,
             ).to(output_dtype)
         else:
+            assert self.backend == "cutlass"
             out = cutlass_scaled_fp4_mm(*mm_args)
 
         if bias is not None:
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 79bf8109b8fd2..41f82de4ff0a6 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -926,22 +926,26 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
     def __init__(self, quant_config: ModelOptNvFp4Config) -> None:
         self.quant_config = quant_config
 
-        if envs.VLLM_USE_TRTLLM_FP4_GEMM:
-            assert has_flashinfer(), "TRTLLM FP4 GEMM requires FlashInfer"
-            self.backend = "flashinfer-trtllm"
-        elif has_flashinfer():
-            self.backend = "flashinfer-cutlass"
-        elif cutlass_fp4_supported():
-            self.backend = "cutlass"
-        elif is_fp4_marlin_supported():
-            self.backend = "marlin"
-        else:
+        self.backend = "none"
+        if envs.VLLM_NVFP4_GEMM_BACKEND is None:
+            if has_flashinfer():
+                self.backend = "flashinfer-cutlass"
+            elif cutlass_fp4_supported():
+                self.backend = "cutlass"
+            elif is_fp4_marlin_supported():
+                self.backend = "marlin"
+        elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"):
+            self.backend = envs.VLLM_NVFP4_GEMM_BACKEND
+            assert has_flashinfer(), f"FlashInfer is required for {self.backend}"
+
+        if self.backend == "none":
             raise ValueError(
-                "Current platform does not support NVFP4"
-                " quantization. Please use Blackwell and"
-                " above."
+                "No valid NVFP4 GEMM backend found. "
+                "Please check your platform capability."
             )
 
+        logger.info_once(f"Using {self.backend} for NVFP4 GEMM")
+
     def create_weights(
         self,
         layer: torch.nn.Module,
@@ -1109,11 +1113,11 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
             layer.alpha,
             output_dtype,
         )
-        if self.backend == "flashinfer-trtllm":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="trtllm")
-        elif self.backend == "flashinfer-cutlass":
-            out = flashinfer_scaled_fp4_mm(*mm_args, backend="cutlass")
+        if self.backend.startswith("flashinfer-"):
+            backend_name = self.backend[len("flashinfer-") :]
+            out = flashinfer_scaled_fp4_mm(*mm_args, backend=backend_name)
         else:
+            assert self.backend == "cutlass"
             out = cutlass_scaled_fp4_mm(*mm_args)
 
         if bias is not None:

From 1f491aa0c80c2bf07e3ad37c4b6af8a869d48b5d Mon Sep 17 00:00:00 2001
From: Benji Beck <benjibeck@meta.com>
Date: Wed, 15 Oct 2025 11:54:41 -0700
Subject: [PATCH 47/51] Vectorize RMS norm variance using
 vectorize_read_with_alignment (#26234)

Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
---
 csrc/layernorm_kernels.cu       | 18 +++++++++++++++---
 csrc/layernorm_quant_kernels.cu | 19 ++++++++++++++++---
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index 6c3685f6f7cdc..aa7927f09cbbf 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -2,6 +2,7 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
+#include "quantization/vectorization_utils.cuh"
 
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
     const float epsilon, const int num_tokens, const int hidden_size) {
   __shared__ float s_variance;
   float variance = 0.0f;
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  constexpr int VEC_SIZE = 8;
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float x = static_cast<float>(vec.val[i]);
+      variance += x * x;
+    }
+  };
+  auto scalar_op = [&variance](const scalar_t& val) {
+    float x = static_cast<float>(val);
     variance += x * x;
-  }
+  };
+  vllm::vectorize_read_with_alignment<VEC_SIZE>(
+      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;
diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu
index 0fc462194fcde..7f9a0bccdd348 100644
--- a/csrc/layernorm_quant_kernels.cu
+++ b/csrc/layernorm_quant_kernels.cu
@@ -10,6 +10,7 @@
 #include "dispatch_utils.h"
 #include "cub_helpers.h"
 #include "core/batch_invariant.hpp"
+#include "quantization/vectorization_utils.cuh"
 
 #include <torch/cuda.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
   __shared__ float s_variance;
   float variance = 0.0f;
 
-  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
-    const float x = (float)input[blockIdx.x * input_stride + idx];
+  const scalar_t* input_row = input + blockIdx.x * input_stride;
+
+  constexpr int VEC_SIZE = 8;
+  auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
+#pragma unroll
+    for (int i = 0; i < VEC_SIZE; ++i) {
+      float x = static_cast<float>(vec.val[i]);
+      variance += x * x;
+    }
+  };
+  auto scalar_op = [&variance](const scalar_t& val) {
+    float x = static_cast<float>(val);
     variance += x * x;
-  }
+  };
+  vllm::vectorize_read_with_alignment<VEC_SIZE>(
+      input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
 
   using BlockReduce = cub::BlockReduce<float, 1024>;
   __shared__ typename BlockReduce::TempStorage reduceStore;

From 0b99f5d3023a84f120e6af7df355824e0f39af93 Mon Sep 17 00:00:00 2001
From: XiaobingZhang <xiaobingzhangupc@gmail.com>
Date: Thu, 16 Oct 2025 03:06:47 +0800
Subject: [PATCH 48/51] support flashinfer_fp4 moe for 5090 gpu (#26669)

Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 .../layers/quantization/utils/flashinfer_fp4_moe.py             | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index ddb74a27dc122..5ce0188b60aed 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -32,7 +32,7 @@ def is_flashinfer_fp4_cutlass_moe_available() -> bool:
         envs.VLLM_USE_FLASHINFER_MOE_FP4
         and has_flashinfer_cutlass_fused_moe()
         and current_platform.is_cuda()
-        and current_platform.is_device_capability(100)
+        and current_platform.has_device_capability(100)
     )
 
 

From e5b438a24786decebd0b69340f024f18ea184d44 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 15 Oct 2025 16:18:50 -0400
Subject: [PATCH 49/51] [Bug] Temporarily Disable `VLLM_ALLREDUCE_USE_SYMM_MEM`
 by Default (#26925)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/envs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index cb3dab51eff4d..6f40209dd0004 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -198,7 +198,7 @@ if TYPE_CHECKING:
     VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
     VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
-    VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
+    VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
     VLLM_TUNED_CONFIG_FOLDER: str | None = None
     VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
     VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
@@ -1343,7 +1343,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     ),
     # Whether to use pytorch symmetric memory for allreduce
     "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
-        int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
+        int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))
     ),
     # Allows vllm to find tuned config under customized folder
     "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),

From 0a9ef0cfce131300fea219952b6e85c8dee45578 Mon Sep 17 00:00:00 2001
From: Adrian Abeyta <aabeyta@redhat.com>
Date: Wed, 15 Oct 2025 18:01:38 -0500
Subject: [PATCH 50/51] Move query quantization to attention layer for
 Flashinfer & Triton. (#26534)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
---
 tests/compile/test_fusion_attn.py         |  4 +++-
 vllm/attention/backends/abstract.py       | 24 +++++++++++++++--------
 vllm/attention/layer.py                   |  9 ++++++---
 vllm/v1/attention/backends/flash_attn.py  |  4 +++-
 vllm/v1/attention/backends/flashinfer.py  | 22 +++++++++++----------
 vllm/v1/attention/backends/triton_attn.py | 18 +++--------------
 6 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index a8d78daa32a1d..4d6f4b471a3a4 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -421,7 +421,9 @@ def test_attention_quant_pattern(
     ]
     if any(attn_fusion_supported):
         # Check quantization ops in the graph before and after fusion
-        test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=True)
+        # Note: fully_replaced=False because query quant ops remain in graph.
+        # Only output quant ops are fused into attention.
+        test_backend.check_before_ops([QUANT_OPS[quant_key]], fully_replaced=False)
 
     # access the underlying `AttnFusionPass` on the `LazyInitPass`
     assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 421b0c4beb370..fb2db4d0b0ec3 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -41,14 +41,6 @@ class AttentionBackend(ABC):
     # makes sure the output tensor is allocated inside the cudagraph.
     accept_output_buffer: bool = False
 
-    # Whether this backend supports receiving pre-quantized query input.
-    # If True, the attention layer will handle query quantization instead
-    # of the backend, allowing torch.compile to fuse quantization with
-    # previous operations.
-    # Needs to be worked through for all backends
-    # https://github.com/vllm-project/vllm/issues/25584
-    supports_quant_query_input: bool = False
-
     @staticmethod
     @abstractmethod
     def get_name() -> str:
@@ -199,6 +191,22 @@ class AttentionImpl(ABC, Generic[T]):
         """
         return False
 
+    def supports_quant_query_input(self) -> bool:
+        """
+        Check if this attention implementation supports pre-quantized query input.
+
+        When True, the attention layer will quantize queries before passing them
+        to this backend, allowing torch.compile to fuse the quantization with
+        previous operations. This is typically supported when using FP8 KV cache
+        with compatible attention kernels (e.g., TRT-LLM).
+        TODO: add support for more backends, see
+        https://github.com/vllm-project/vllm/issues/25584
+
+        Returns:
+            bool: True if the implementation can accept pre-quantized queries.
+        """
+        return False
+
 
 class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
     @abstractmethod
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 16c5799f7d0be..9f879f7272e21 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -36,6 +36,7 @@ from vllm.model_executor.models.vision import get_vit_attn_backend
 from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes, direct_register_custom_op
 
+FP8_DTYPE = current_platform.fp8_dtype()
 logger = init_logger(__name__)
 USE_XFORMERS_OPS = None
 
@@ -304,7 +305,7 @@ class Attention(nn.Module, AttentionLayerBase):
         self.query_quant = None
         if (
             self.kv_cache_dtype.startswith("fp8")
-            and self.attn_backend.supports_quant_query_input
+            and self.impl.supports_quant_query_input()
         ):
             self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR)
 
@@ -329,7 +330,6 @@ class Attention(nn.Module, AttentionLayerBase):
         """
         if self.calculate_kv_scales:
             torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name)
-
         output_dtype = query.dtype
         if self.query_quant is not None:
             # quantizing with a simple torch operation enables
@@ -338,7 +338,10 @@ class Attention(nn.Module, AttentionLayerBase):
             # Otherwise queries are quantized using custom ops
             # which causes decoding overheads
             assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"}
-            query, _ = self.query_quant(query, self._q_scale)
+
+            # check if query quantization is supported
+            if self.impl.supports_quant_query_input():
+                query, _ = self.query_quant(query, self._q_scale)
 
         if self.use_output:
             output_shape = output_shape if output_shape is not None else query.shape
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 9e0c125d9edb7..087f995e0528b 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -49,7 +49,6 @@ logger = init_logger(__name__)
 
 class FlashAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
-    supports_quant_query_input: bool = True
 
     @classmethod
     def get_supported_dtypes(cls) -> list[torch.dtype]:
@@ -494,6 +493,9 @@ class FlashAttentionImpl(AttentionImpl):
                 "heads in the layer"
             )
 
+    def supports_quant_query_input(self) -> bool:
+        return True
+
     def forward(
         self,
         layer: torch.nn.Module,
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index ee32f7e2904f7..eb9f6a280d8f6 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -16,7 +16,6 @@ from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache
 from flashinfer.prefill import trtllm_batch_context_with_kv_cache
 from flashinfer.utils import FP4Tensor
 
-from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (
     AttentionBackend,
     AttentionImpl,
@@ -828,6 +827,12 @@ class FlashInferImpl(AttentionImpl):
             and quant_key in (kFp8StaticTensorSym, kNvfp4Quant)
         )
 
+    def supports_quant_query_input(self) -> bool:
+        if flashinfer_disable_q_quantization():
+            return False
+
+        return self.support_trtllm_attn
+
     def forward(
         self,
         layer: torch.nn.Module,
@@ -859,6 +864,12 @@ class FlashInferImpl(AttentionImpl):
             # Profiling run.
             return output.fill_(0)
 
+        # Ensure query dtype matches the expected dtype from attention metadata
+        assert attn_metadata.q_data_type == query.dtype, (
+            f"Query dtype mismatch: expected {attn_metadata.q_data_type}, "
+            f"got {query.dtype}"
+        )
+
         if self.bmm1_scale is None:
             self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale
 
@@ -899,15 +910,6 @@ class FlashInferImpl(AttentionImpl):
                 elif output.dtype == FP4_DTYPE:
                     self.o_sf_scale = layer._o_scale_float
 
-        # Insert FP8 quant for query
-        if attn_metadata.q_data_type == FP8_DTYPE:
-            num_tokens, num_heads, head_size = query.shape
-            query, _ = ops.scaled_fp8_quant(
-                query.reshape((num_tokens, num_heads * head_size)).contiguous(),
-                layer._q_scale,
-            )
-            query = query.reshape((num_tokens, num_heads, head_size))
-
         # IMPORTANT!
         # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in
         # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 9746a0eb58bd2..b1d34dbfd1729 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -32,11 +32,6 @@ from vllm.v1.attention.backends.utils import (
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 
-if current_platform.is_cuda_alike():
-    from vllm import _custom_ops as ops
-elif current_platform.is_xpu():
-    from vllm._ipex_ops import ipex_ops as ops
-
 logger = init_logger(__name__)
 
 
@@ -210,6 +205,9 @@ class TritonAttentionImpl(AttentionImpl):
     def fused_output_quant_supported(self, quant_key: QuantKey):
         return quant_key == kFp8StaticTensorSym
 
+    def supports_quant_query_input(self) -> bool:
+        return current_platform.is_cuda()
+
     def __init__(
         self,
         num_heads: int,
@@ -338,19 +336,9 @@ class TritonAttentionImpl(AttentionImpl):
             if key_cache.dtype != self.fp8_dtype:
                 key_cache = key_cache.view(self.fp8_dtype)
                 value_cache = value_cache.view(self.fp8_dtype)
-            num_tokens, num_heads, head_size = query.shape
             assert layer._q_scale_float == 1.0, (
                 "A non 1.0 q_scale is not currently supported."
             )
-            if current_platform.is_cuda():
-                # Skip Q quantization on ROCm and XPU, enable this on cuda
-                # only, since dequantizing back to f32 in the attention kernel
-                # is not supported.
-                query, _ = ops.scaled_fp8_quant(
-                    query.reshape((num_tokens, num_heads * head_size)).contiguous(),
-                    layer._q_scale,
-                )
-                query = query.reshape((num_tokens, num_heads, head_size))
 
         cu_seqlens_q = attn_metadata.query_start_loc
         seqused_k = attn_metadata.seq_lens

From 938c43ea7fa72cf882403b97b3ea3884308a7e0f Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD
 <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Wed, 15 Oct 2025 18:52:13 -0500
Subject: [PATCH 51/51] [ci] Adjusting AMD test composition 2025-10-14 (#26852)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
---
 .buildkite/test-amd.yaml | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 91f0b850575c4..50b2b61124af0 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -63,7 +63,7 @@ steps:
 
 - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
   timeout_in_minutes: 10
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -353,7 +353,7 @@ steps:
     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: V1 Test others (CPU) # 5 mins
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
@@ -459,6 +459,7 @@ steps:
     - pytest -v -s compile/test_fusion_all_reduce.py
     - pytest -v -s compile/test_decorator.py
     - pytest -v -s compile/test_noop_elimination.py
+    - pytest -v -s compile/test_aot_compile.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -487,14 +488,14 @@ steps:
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
   commands:
-    - pytest -v -s kernels/core
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 
 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
@@ -632,7 +633,7 @@ steps:
 
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
   # grade: Blocking
   fast_check: false