From 3127274d022b0bc8ff6ba9ceef41a99a6f01ad2d Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Wed, 17 Sep 2025 21:04:21 -0700
Subject: [PATCH 01/58] [MM Encoder] Apply DP ViT for Qwen3-VL model series
 (#24955)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 vllm/model_executor/models/qwen3_vl.py     | 94 +++++++++++++++++-----
 vllm/model_executor/models/qwen3_vl_moe.py |  2 +
 2 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 22948aee4936c..2c36dfbce7f67 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -126,20 +126,23 @@ class Qwen3_VisionMLP(nn.Module):
                  bias: bool = False,
                  act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
                  quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
+                 use_data_parallel: bool = False):
         super().__init__()
         self.linear_fc1 = ColumnParallelLinear(in_features,
                                                hidden_features,
                                                bias=bias,
                                                quant_config=quant_config,
                                                return_bias=False,
-                                               prefix=f"{prefix}.linear_fc1")
+                                               prefix=f"{prefix}.linear_fc1",
+                                               disable_tp=use_data_parallel)
         self.linear_fc2 = RowParallelLinear(hidden_features,
                                             in_features,
                                             bias=bias,
                                             quant_config=quant_config,
                                             return_bias=False,
-                                            prefix=f"{prefix}.linear_fc2")
+                                            prefix=f"{prefix}.linear_fc2",
+                                            disable_tp=use_data_parallel)
         self.act_fn = act_fn
 
     def forward(self, x: torch.Tensor):
@@ -158,23 +161,27 @@ class Qwen3_VisionBlock(nn.Module):
         norm_layer: Optional[Callable[[int], nn.Module]] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
         if norm_layer is None:
             norm_layer = partial(nn.LayerNorm, eps=1e-6)
         self.norm1 = norm_layer(dim)
         self.norm2 = norm_layer(dim)
-        self.attn = Qwen2_5_VisionAttention(embed_dim=dim,
-                                            num_heads=num_heads,
-                                            projection_size=dim,
-                                            quant_config=quant_config,
-                                            prefix=f"{prefix}.attn")
+        self.attn = Qwen2_5_VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+            use_data_parallel=use_data_parallel)
         self.mlp = Qwen3_VisionMLP(dim,
                                    mlp_hidden_dim,
                                    act_fn=act_fn,
                                    bias=True,
                                    quant_config=quant_config,
-                                   prefix=f"{prefix}.mlp")
+                                   prefix=f"{prefix}.mlp",
+                                   use_data_parallel=use_data_parallel)
 
     def forward(
             self,
@@ -205,6 +212,7 @@ class Qwen3_VisionPatchMerger(nn.Module):
         use_postshuffle_norm: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = context_dim * (spatial_merge_size**2)
@@ -222,13 +230,15 @@ class Qwen3_VisionPatchMerger(nn.Module):
                                                self.hidden_size,
                                                bias=True,
                                                quant_config=quant_config,
-                                               prefix=f"{prefix}.linear_fc1")
+                                               prefix=f"{prefix}.linear_fc1",
+                                               disable_tp=use_data_parallel)
         self.act_fn = nn.GELU()
         self.linear_fc2 = RowParallelLinear(self.hidden_size,
                                             d_model,
                                             bias=True,
                                             quant_config=quant_config,
-                                            prefix=f"{prefix}.linear_fc2")
+                                            prefix=f"{prefix}.linear_fc2",
+                                            disable_tp=use_data_parallel)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.use_postshuffle_norm:
@@ -250,6 +260,7 @@ class Qwen3_VisionTransformer(nn.Module):
         norm_eps: float = 1e-6,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = vision_config.hidden_size
@@ -260,6 +271,12 @@ class Qwen3_VisionTransformer(nn.Module):
         self.spatial_merge_unit = self.spatial_merge_size**2
         self.temporal_patch_size = vision_config.temporal_patch_size
         self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes
+        self.use_data_parallel = use_data_parallel
+
+        # NOTE: This is used for creating empty tensor for all_gather for
+        # DP ViT. Here out_hidden_size is enlarged due to deepstack
+        self.out_hidden_size = (vision_config.out_hidden_size *
+                                (1 + len(self.deepstack_visual_indexes)))
 
         self.patch_embed = Qwen3_VisionPatchEmbed(
             patch_size=self.patch_size,
@@ -283,7 +300,8 @@ class Qwen3_VisionTransformer(nn.Module):
                 act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
                 norm_layer=norm_layer,
                 quant_config=quant_config,
-                prefix=f"{prefix}.blocks.{layer_idx}")
+                prefix=f"{prefix}.blocks.{layer_idx}",
+                use_data_parallel=use_data_parallel)
             for layer_idx in range(vision_config.depth)
         ])
 
@@ -294,6 +312,7 @@ class Qwen3_VisionTransformer(nn.Module):
             spatial_merge_size=self.spatial_merge_size,
             quant_config=quant_config,
             prefix=f"{prefix}.merger",
+            use_data_parallel=use_data_parallel,
         )
 
         self.deepstack_merger_list = nn.ModuleList([
@@ -304,7 +323,8 @@ class Qwen3_VisionTransformer(nn.Module):
                 use_postshuffle_norm=True,
                 norm_layer=norm_layer,
                 quant_config=quant_config,
-                prefix=f"{prefix}.deepstack_merger_list.{layer_idx}")
+                prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
+                use_data_parallel=use_data_parallel)
             for layer_idx in range(len(self.deepstack_visual_indexes))
         ])
 
@@ -325,7 +345,14 @@ class Qwen3_VisionTransformer(nn.Module):
 
     def rot_pos_emb(self, grid_thw):
         pos_ids = []
-        for t, h, w in grid_thw:
+        # Support both Tensor and list inputs for DP path
+        if isinstance(grid_thw, list):
+            grid_list = grid_thw
+            max_grid_size = max(max(h, w) for _, h, w in grid_list)
+        else:
+            grid_list = grid_thw.tolist()
+            max_grid_size = int(grid_thw[:, 1:].max().item())
+        for t, h, w in grid_list:
             hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
             hpos_ids = hpos_ids.reshape(
                 h // self.spatial_merge_size,
@@ -348,7 +375,6 @@ class Qwen3_VisionTransformer(nn.Module):
             pos_ids.append(
                 torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
         pos_ids = torch.cat(pos_ids, dim=0)
-        max_grid_size = grid_thw[:, 1:].max()
         rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb
@@ -453,10 +479,18 @@ class Qwen3_VisionTransformer(nn.Module):
         hidden_states = hidden_states + pos_embeds
         rotary_pos_emb = self.rot_pos_emb(grid_thw)
 
+        if isinstance(grid_thw, list):
+            grid_thw_tensor = torch.tensor(grid_thw,
+                                           device=hidden_states.device,
+                                           dtype=torch.int32)
+        else:
+            grid_thw_tensor = grid_thw
+
         cu_seqlens = torch.repeat_interleave(
-            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2],
+            grid_thw_tensor[:, 0]).cumsum(
                 dim=0,
-                dtype=grid_thw.dtype
+                dtype=grid_thw_tensor.dtype
                 if torch.jit.is_tracing() else torch.int32,
             )
         cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
@@ -984,6 +1018,9 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             "up_proj",
         ],
     }
+
+    supports_encoder_tp_data = True
+
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
         orig_to_new_prefix={
@@ -1009,12 +1046,14 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
 
         self.config = config
         self.multimodal_config = multimodal_config
+        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
 
         self.visual = Qwen3_VisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
             quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
         )
 
         self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config,
@@ -1177,7 +1216,15 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+            if self.use_data_parallel:
+                from vllm.multimodal.utils import (
+                    run_dp_sharded_mrope_vision_model)
+                return run_dp_sharded_mrope_vision_model(self.visual,
+                                                         pixel_values,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")
+            else:
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each image item.
         # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
@@ -1199,7 +1246,16 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal,
         else:
             pixel_values_videos = video_input["pixel_values_videos"].type(
                 self.visual.dtype)
-            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+            if self.use_data_parallel:
+                from vllm.multimodal.utils import (
+                    run_dp_sharded_mrope_vision_model)
+                return run_dp_sharded_mrope_vision_model(self.visual,
+                                                         pixel_values_videos,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")
+            else:
+                video_embeds = self.visual(pixel_values_videos,
+                                           grid_thw=grid_thw)
 
         # Split concatenated embeddings for each video item.
         # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index a800e94ab1e50..d25bc71dcb59b 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -315,12 +315,14 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
 
         self.config = config
         self.multimodal_config = multimodal_config
+        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
 
         self.visual = Qwen3_VisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
             quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix=maybe_prefix(prefix, "visual"),
+            use_data_parallel=self.use_data_parallel,
         )
 
         self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config,

From 32baf1d03685ead1f5946f867e4ca16007bd10b5 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 05:05:18 +0100
Subject: [PATCH 02/58] [Docs] Clean up the contributing README (#25099)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 docs/contributing/README.md                   | 177 +++++++++---------
 .../installation/python_env_setup.inc.md      |   2 +-
 mkdocs.yaml                                   |   1 +
 3 files changed, 95 insertions(+), 85 deletions(-)

diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index 5a2a70d57e85f..b0a95b3b3d3a5 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -26,113 +26,123 @@ See <gh-file:LICENSE>.
 
 ## Developing
 
---8<-- "docs/getting_started/installation/python_env_setup.inc.md"
-
-Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
-Check out the [building from source][build-from-source] documentation for details.
-
-For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
-
-### Building the docs with MkDocs
-
-#### Introduction to MkDocs
-
-[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
-
-#### Install MkDocs and Plugins
-
-Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
-
-```bash
-uv pip install -r requirements/docs.txt
-```
-
-!!! note
-    Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
-
-#### Verify Installation
-
-Confirm that MkDocs is correctly installed:
-
-```bash
-mkdocs --version
-```
-
-Example output:
-
-```console
-mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
-```
-
-#### Clone the `vLLM` repository
+The first step of contributing to vLLM is to clone the GitHub repository:
 
 ```bash
 git clone https://github.com/vllm-project/vllm.git
 cd vllm
 ```
 
-#### Start the Development Server
+Then, configure your Python virtual environment.
 
-MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
+--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
+
+If you are only developing vLLM's Python code, install vLLM using:
 
 ```bash
-mkdocs serve
+VLLM_USE_PRECOMPILED=1 uv pip install -e .
 ```
 
-Example output:
+If you are developing vLLM's Python and CUDA/C++ code, install vLLM using:
 
-```console
-INFO    -  Documentation built in 106.83 seconds
-INFO    -  [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
-INFO    -  [22:02:02] Serving on http://127.0.0.1:8000/
+```bash
+uv pip install -e .
 ```
 
-#### View in Your Browser
+For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section.
 
-Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
-
-#### Learn More
-
-For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
-
-## Testing
-
-??? console "Commands"
-
-    ```bash
-    # These commands are only for Nvidia CUDA platforms.
-    uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
-
-    # Linting, formatting and static type checking
-    pre-commit install
-
-    # You can manually run pre-commit with
-    pre-commit run --all-files --show-diff-on-failure
-
-    # To manually run something from CI that does not run
-    # locally by default, you can run:
-    pre-commit run mypy-3.9 --hook-stage manual --all-files
-
-    # Unit tests
-    pytest tests/
-
-    # Run tests for a single test file with detailed output
-    pytest -s -v tests/test_logger.py
-    ```
+For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
 
 !!! tip
-    Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
+    vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12.
 
     Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
 
-!!! note "Install python3-dev if Python.h is missing"
+### Linting
+
+vLLM uses `pre-commit` to lint and format the codebase. See <https://pre-commit.com/#usage> if `pre-commit` is new to you. Setting up `pre-commit` is as easy as:
+
+```bash
+uv pip install pre-commit
+pre-commit install
+```
+
+vLLM's `pre-commit` hooks will now run automatically every time you commit.
+
+!!! tip "Tips"
+    You can manually run the `pre-commit` hooks using:
+
+    ```bash
+    pre-commit run     # runs on staged files
+    pre-commit run -a  # runs on all files (short for --all-files)
+    ```
+
+    ---
+
+    Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with:
+
+    ```bash
+    pre-commit run --hook-stage manual markdownlint
+    pre-commit run --hook-stage manual mypy-3.9
+    ```
+
+### Documentation
+
+MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, <gh-file:mkdocs.yaml>.
+
+Get started with:
+
+```bash
+uv pip install -r requirements/docs.txt
+```
+
+!!! tip
+    Ensure that your Python version is compatible with the plugins
+    (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
+
+MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it.
+From the root of the repository, run:
+
+```bash
+mkdocs serve                           # with API ref (~10 minutes)
+API_AUTONAV_EXCLUDE=vllm mkdocs serve  # API ref off (~15 seconds)
+```
+
+Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready!
+Open <http://127.0.0.1:8000/> in your browser to see it.
+
+For additional features and advanced configurations, refer to the:
+
+- [MkDocs documentation](https://www.mkdocs.org/)
+- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use)
+
+### Testing
+
+vLLM uses `pytest` to test the codebase.
+
+```bash
+# Install the test dependencies used in CI (CUDA only)
+uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
+
+# Install some common test dependencies (hardware agnostic)
+uv pip install pytest pytest-asyncio
+
+# Run all tests
+pytest tests/
+
+# Run tests for a single test file with detailed output
+pytest -s -v tests/test_logger.py
+```
+
+!!! tip "Install python3-dev if Python.h is missing"
     If any of the above commands fails with `Python.h: No such file or directory`, install
     `python3-dev` with `sudo apt install python3-dev`.
 
-!!! note
+!!! warning "Warnings"
     Currently, the repository is not fully checked by `mypy`.
 
-!!! note
+    ---
+
     Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
     platform to run unit tests locally, rely on the continuous integration system to run the tests for
     now.
@@ -194,8 +204,7 @@ appropriately to indicate the type of change. Please use one of the following:
 The PR needs to meet the following code quality standards:
 
 - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
-- Pass all linter checks. Please use `pre-commit` to format your code. See
-  <https://pre-commit.com/#usage> if `pre-commit` is new to you.
+- Pass all linter checks.
 - The code needs to be well-documented to ensure future contributors can easily
   understand the code.
 - Include sufficient tests to ensure the project stays correct and robust. This
diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md
index 423bf9b00d07f..06794f8d3120e 100644
--- a/docs/getting_started/installation/python_env_setup.inc.md
+++ b/docs/getting_started/installation/python_env_setup.inc.md
@@ -1,4 +1,4 @@
-It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:
+It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
 
 ```bash
 uv venv --python 3.12 --seed
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 507a80c41e8b4..bbd850bdfee34 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -79,6 +79,7 @@ plugins:
         - "re:vllm\\._.*"  # Internal modules
         - "vllm.third_party"
         - "vllm.vllm_flash_attn"
+        - !ENV [API_AUTONAV_EXCLUDE, ""]
   - mkdocstrings:
       handlers:
         python:

From b98219670fb1ca2952d449404c2b4921d7cdce73 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Thu, 18 Sep 2025 05:08:41 +0100
Subject: [PATCH 03/58] [Core][MM] Cleanup `MultiModalCache` (#25006)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/multimodal/cache.py | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 31ae450f4c2ff..297b4c7fa7fbd 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import operator
 import sys
 from abc import ABC, abstractmethod
 from collections.abc import Mapping, Sequence
@@ -91,26 +92,15 @@ _V = TypeVar("_V", bound=MultiModalCacheValue)
 class MultiModalCache:
 
     @classmethod
-    def get_leaf_size(
-        cls,
-        leaf: object,
-        *,
-        debug: bool = False,
-    ) -> int:
+    def get_leaf_size(cls, leaf: object) -> int:
         if isinstance(leaf, MultiModalProcessorCacheItem):
             return cls.get_leaf_size(leaf.item)
         if isinstance(leaf, MultiModalProcessorCacheItemMetadata):
             return leaf.item_size
 
         # These are not subclasses of dict
-        if isinstance(leaf, MultiModalKwargsItems):
-            return cls.get_item_size(leaf.data)  # type: ignore
-        if isinstance(leaf, MultiModalKwargsItem):
-            return cls.get_item_size(leaf.data)  # type: ignore
-        if isinstance(leaf, MultiModalKwargs):
-            return cls.get_item_size(leaf.data)  # type: ignore
-
-        if isinstance(leaf, MultiModalFieldElem):
+        if isinstance(leaf, (MultiModalKwargs, MultiModalKwargsItems,
+                             MultiModalKwargsItem, MultiModalFieldElem)):
             return cls.get_item_size(leaf.data)  # type: ignore
 
         # sys.getsizeof doesn't work for tensors
@@ -126,11 +116,8 @@ class MultiModalCache:
         *,
         debug: bool = False,
     ) -> int:
-        size = json_reduce_leaves(
-            lambda a, b: a + b,
-            json_map_leaves(lambda x: cls.get_leaf_size(x, debug=debug),
-                            value),
-        )
+        size = json_reduce_leaves(operator.add,
+                                  json_map_leaves(cls.get_leaf_size, value))
 
         if debug:
             leaf_count = json_count_leaves(value)

From 027d37df389b00ed2e7d874113f869267533a2ab Mon Sep 17 00:00:00 2001
From: toncao <130689535+toncao@users.noreply.github.com>
Date: Thu, 18 Sep 2025 11:08:50 +0700
Subject: [PATCH 04/58] [Bugfix][Qwen3-Next] add prefixes to shared_expert in
 qwen3-next and mlp in qwen2moe to successfully load ignored params in
 quantized models (#24960)

Signed-off-by: toncao <cpatonn@gmail.com>
Co-authored-by: toncao <cpatonn@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/qwen2_moe.py  | 48 ++++++++++++------------
 vllm/model_executor/models/qwen3_next.py |  1 +
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 5e6dea67c9404..6c6276a930453 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -72,17 +72,20 @@ class Qwen2MoeMLP(nn.Module):
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        prefix: str = "",
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
             hidden_size, [intermediate_size] * 2,
             bias=False,
-            quant_config=quant_config)
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj")
         self.down_proj = RowParallelLinear(intermediate_size,
                                            hidden_size,
                                            bias=False,
                                            quant_config=quant_config,
-                                           reduce_results=reduce_results)
+                                           reduce_results=reduce_results,
+                                           prefix=f"{prefix}.down_proj")
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
                              "Only silu is supported for now.")
@@ -123,7 +126,8 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         self.gate = ReplicatedLinear(config.hidden_size,
                                      config.num_experts,
                                      bias=False,
-                                     quant_config=None)
+                                     quant_config=None,
+                                     prefix=f"{prefix}.gate")
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -132,6 +136,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
                 quant_config=quant_config,
                 reduce_results=self.experts.must_reduce_shared_expert_outputs(
                 ),
+                prefix=f"{prefix}.shared_expert",
             )
         else:
             self.shared_expert = None
@@ -203,21 +208,19 @@ class Qwen2MoeAttention(nn.Module):
         self.max_position_embeddings = max_position_embeddings
         self.dual_chunk_attention_config = dual_chunk_attention_config
 
-        self.qkv_proj = QKVParallelLinear(
-            hidden_size,
-            self.head_dim,
-            self.total_num_heads,
-            self.total_num_kv_heads,
-            bias=True,
-            quant_config=quant_config,
-        )
+        self.qkv_proj = QKVParallelLinear(hidden_size,
+                                          self.head_dim,
+                                          self.total_num_heads,
+                                          self.total_num_kv_heads,
+                                          bias=True,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.qkv_proj")
 
-        self.o_proj = RowParallelLinear(
-            self.total_num_heads * self.head_dim,
-            hidden_size,
-            bias=False,
-            quant_config=quant_config,
-        )
+        self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim,
+                                        hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config,
+                                        prefix=f"{prefix}.o_proj")
 
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -296,12 +299,11 @@ class Qwen2MoeDecoderLayer(nn.Module):
                                               quant_config=quant_config,
                                               prefix=f"{prefix}.mlp")
         else:
-            self.mlp = Qwen2MoeMLP(
-                hidden_size=config.hidden_size,
-                intermediate_size=config.intermediate_size,
-                hidden_act=config.hidden_act,
-                quant_config=quant_config,
-            )
+            self.mlp = Qwen2MoeMLP(hidden_size=config.hidden_size,
+                                   intermediate_size=config.intermediate_size,
+                                   hidden_act=config.hidden_act,
+                                   quant_config=quant_config,
+                                   prefix=f"{prefix}.mlp")
         self.input_layernorm = RMSNorm(config.hidden_size,
                                        eps=config.rms_norm_eps)
         self.post_attention_layernorm = RMSNorm(config.hidden_size,
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index fe63e93032352..ca9f4d402dac2 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -138,6 +138,7 @@ class Qwen3NextSparseMoeBlock(nn.Module):
                 quant_config=quant_config,
                 reduce_results=self.experts.must_reduce_shared_expert_outputs(
                 ),
+                prefix=f"{prefix}.shared_expert",
             )
         else:
             self.shared_expert = None

From dc2979c58574e7a49d17b50c5770010039145aac Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Thu, 18 Sep 2025 00:10:21 -0400
Subject: [PATCH 05/58] [Kernels] Overlap shared experts with combine instead
 of dispatch (#24254)

Signed-off-by: Bill Nell <bnell@redhat.com>
---
 .../fused_moe/deepep_ht_prepare_finalize.py   | 50 +++++++++-
 .../fused_moe/deepep_ll_prepare_finalize.py   | 55 +++++++++--
 .../layers/fused_moe/modular_kernel.py        | 95 +++++++++++++++----
 .../layers/fused_moe/pplx_prepare_finalize.py | 39 +++++++-
 4 files changed, 203 insertions(+), 36 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
index 5d6b9c87a6b76..f390f0a25875e 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -240,7 +240,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                                            quant_config)
         return receiver()
 
-    def finalize(
+    def _finalize(
         self,
         output: torch.Tensor,
         fused_expert_output: torch.Tensor,
@@ -248,7 +248,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
+        do_async: bool,
+    ) -> Optional[Callable]:
 
         assert self.handle is not None
 
@@ -271,7 +272,46 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             topk_weights=None,
             config=self._get_combine_config(),
             previous_event=None,
-            async_finish=False,
+            async_finish=do_async,
             allocate_on_comm_stream=False)
-        # Respect inplace outputs.
-        output.copy_(combined_x, non_blocking=True)
+
+        if do_async:
+
+            def _receiver():
+                event.current_stream_wait()
+                # Respect inplace outputs.
+                output.copy_(combined_x, non_blocking=True)
+
+            return lambda: _receiver()
+        else:
+            # Respect inplace outputs.
+            output.copy_(combined_x, non_blocking=True)
+            return None
+
+    def finalize_async(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> Callable:
+        receiver = self._finalize(output, fused_expert_output, topk_weights,
+                                  topk_ids, apply_router_weight_on_input,
+                                  weight_and_reduce_impl, True)
+        assert receiver is not None
+        return receiver
+
+    def finalize(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> None:
+        self._finalize(output, fused_expert_output, topk_weights, topk_ids,
+                       apply_router_weight_on_input, weight_and_reduce_impl,
+                       False)
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index 01df7770463d0..101fc8798c427 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -12,8 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input, normalize_batched_scales_shape)
 from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled,
-                                      dbo_maybe_run_recv_hook,
-                                      dbo_register_recv_hook, dbo_yield)
+                                      dbo_maybe_run_recv_hook)
 
 # DeepEP kernels quantize dispatch inputs in 128 element chunks.
 DEEPEP_QUANT_BLOCK_SIZE = 128
@@ -198,7 +197,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         hook()
         return receiver()
 
-    def finalize(
+    def _finalize(
         self,
         output: torch.Tensor,
         fused_expert_output: torch.Tensor,
@@ -206,13 +205,14 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
+        do_async: bool,
+    ) -> Optional[Callable]:
         assert isinstance(
             weight_and_reduce_impl, TopKWeightAndReduceDelegate
         ), ("Weight application and reduction happens in the combine kernel.")
 
         a2a_idx = dbo_current_ubatch_id()
-        do_recv_hook = dbo_enabled()
+        do_recv_hook = dbo_enabled() or do_async
         handle = self.handles[a2a_idx]
         assert handle is not None
 
@@ -232,6 +232,45 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             zero_copy=False,
             return_recv_hook=do_recv_hook,
             out=output)
-        if recv_hook is not None:
-            dbo_register_recv_hook(recv_hook)
-        dbo_yield()
+
+        return recv_hook
+
+    def finalize_async(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> Callable:
+        recv_hook = self._finalize(
+            output,
+            fused_expert_output,
+            topk_weights,
+            topk_ids,
+            apply_router_weight_on_input,
+            weight_and_reduce_impl,
+            do_async=True,
+        )
+        assert recv_hook is not None
+        return recv_hook
+
+    def finalize(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> None:
+        self._finalize(
+            output,
+            fused_expert_output,
+            topk_weights,
+            topk_ids,
+            apply_router_weight_on_input,
+            weight_and_reduce_impl,
+            do_async=False,
+        )
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 58cd0294c8c44..729f8e39cf0f7 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -209,7 +209,8 @@ class FusedMoEPrepareAndFinalize(ABC):
 
     def supports_async(self) -> bool:
         """
-        Indicates whether or not this class implements prepare_async.
+        Indicates whether or not this class implements prepare_async and
+        finalize_async.
         """
         return False
 
@@ -275,6 +276,42 @@ class FusedMoEPrepareAndFinalize(ABC):
         """
         raise NotImplementedError
 
+    def finalize_async(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: TopKWeightAndReduce,
+    ) -> Callable:
+        """
+        Perform any combine plus apply weights and perform a reduction on the
+        fused experts output but do not wait for results from other workers.
+        - output: The output tensor, written in place.  Must be (M, K) shape.
+        - fused_expert_output: The unweighted, unreduced output of the fused
+          experts, it will have (M, topk, K) shape.
+        - topk_weights: The weights to be applied to fused_expert_output.
+        - topk_ids: The topk_ids.
+        - apply_router_weight_on_input: When False, apply the weights to
+          fused_expert_output.
+        - weight_and_reduce_impl: An optional TopKWeightAndReduce
+          implementation.
+
+        Returns a callback that when invoked waits for results from other
+        workers and has the same return signature as `finalize`, e.g.
+
+        receiver = obj.finalize_async(output, ...)
+        ... output not valid yet ...
+        receiver()
+        ... output valid here ...
+
+        is equivalent to:
+
+        obj.finalize(output, ...)
+        """
+        raise NotImplementedError
+
     @property
     @abstractmethod
     def activation_format(self) -> FusedMoEActivationFormat:
@@ -814,23 +851,20 @@ class FusedMoEModularKernel(torch.nn.Module):
         """
 
         a1 = hidden_states
-        output = a1 if inplace else torch.zeros_like(a1)
+        if inplace and self.shared_experts is None:
+            output = a1
+        else:
+            output = torch.zeros_like(a1)
 
         local_num_experts = w1.size(0)
         if global_num_experts == -1:
             global_num_experts = local_num_experts
 
-        shared_output: torch.Tensor
-
         if not self.prepare_finalize.supports_async():
             # We shouldn't be running an a2a kernel that doesn't
             # support async prepare/finalize
             assert not dbo_enabled()
 
-            # Run shared experts serially with dispatch.
-            if self.shared_experts is not None:
-                shared_output = self.shared_experts(a1)
-
             (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
              _expert_topk_weights) = self.prepare_finalize.prepare(
                  a1,
@@ -854,9 +888,6 @@ class FusedMoEModularKernel(torch.nn.Module):
                 self.fused_experts.quant_config,
             )
 
-            if self.shared_experts is not None:
-                shared_output = self.shared_experts(a1)
-
             # If DBO is being used, register the hook with the ubatch context
             # and call it in dbo_maybe_run_recv_hook instead of passing it to
             # the receiver.
@@ -900,16 +931,42 @@ class FusedMoEModularKernel(torch.nn.Module):
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
 
-        self.prepare_finalize.finalize(
-            output,
-            fused_out,
-            topk_weights,
-            topk_ids,
-            apply_router_weight_on_input,
-            self.fused_experts.finalize_weight_and_reduce_impl(),
-        )
+        shared_output: Optional[torch.Tensor] = None
+
+        if not self.prepare_finalize.supports_async():
+            assert not dbo_enabled()
+
+            self.prepare_finalize.finalize(
+                output,
+                fused_out,
+                topk_weights,
+                topk_ids,
+                apply_router_weight_on_input,
+                self.fused_experts.finalize_weight_and_reduce_impl(),
+            )
+            if self.shared_experts is not None:
+                shared_output = self.shared_experts(a1)
+        else:
+            recv_hook = self.prepare_finalize.finalize_async(
+                output,
+                fused_out,
+                topk_weights,
+                topk_ids,
+                apply_router_weight_on_input,
+                self.fused_experts.finalize_weight_and_reduce_impl(),
+            )
+
+            if self.shared_experts is not None:
+                shared_output = self.shared_experts(a1)
+
+            assert recv_hook is not None
+            dbo_register_recv_hook(recv_hook)
+            dbo_yield()
+            if not dbo_enabled():
+                recv_hook()
 
         if self.shared_experts is None:
             return output
         else:
+            assert shared_output is not None
             return shared_output, output
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 32d12476dd01a..ddddd2a3b7a2e 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -272,7 +272,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         hook()
         return receiver()
 
-    def finalize(
+    def finalize_async(
         self,
         output: torch.Tensor,
         fused_expert_output: torch.Tensor,
@@ -280,7 +280,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
+    ) -> Callable:
         assert isinstance(
             weight_and_reduce_impl, TopKWeightAndReduceDelegate
         ), ("Weight application and reduction happens in the combine kernel.")
@@ -303,8 +303,39 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         if apply_router_weight_on_input:
             topk_weights = torch.ones_like(topk_weights)
 
+        topk_ids_u32 = topk_ids.view(dtype=torch.uint32)
+
         self.a2a.combine(out_tokens=output,
-                         indices=topk_ids.view(dtype=torch.uint32),
+                         indices=topk_ids_u32,
                          weights=topk_weights,
                          expert_y=fused_expert_output,
-                         bound_m=bound_m)
+                         bound_m=bound_m,
+                         do_send=True,
+                         do_recv=False)
+
+        return lambda: self.a2a.combine(out_tokens=output,
+                                        indices=topk_ids_u32,
+                                        weights=topk_weights,
+                                        expert_y=fused_expert_output,
+                                        bound_m=bound_m,
+                                        do_send=False,
+                                        do_recv=True)
+
+    def finalize(
+        self,
+        output: torch.Tensor,
+        fused_expert_output: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        apply_router_weight_on_input: bool,
+        weight_and_reduce_impl: mk.TopKWeightAndReduce,
+    ) -> None:
+        receiver = self.finalize_async(
+            output,
+            fused_expert_output,
+            topk_weights,
+            topk_ids,
+            apply_router_weight_on_input,
+            weight_and_reduce_impl,
+        )
+        receiver()

From 52bc9d5b3edbf8804758d46cde28024d6c362e42 Mon Sep 17 00:00:00 2001
From: YiwenC <54658925+666even666@users.noreply.github.com>
Date: Wed, 17 Sep 2025 21:11:46 -0700
Subject: [PATCH 06/58] [Model] enable data parallel for InternVL vision
 encoder (#23909)

Signed-off-by: Yiwen Chen <yiwen66@berkeley.edu>
Signed-off-by: YiwenC <54658925+666even666@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
---
 docs/configuration/optimization.md       |   1 +
 vllm/model_executor/models/intern_vit.py | 107 ++++++++++++++++-------
 vllm/model_executor/models/internvl.py   |   5 +-
 3 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index 5807d787cf531..5564d8a81d937 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -175,6 +175,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u
 Known supported models:
 
 - GLM-4.5V GLM-4.1V (<gh-pr:23168>)
+- InternVL (<gh-pr:23909>)
 - Kimi-VL (<gh-pr:23817>)
 - Llama4 (<gh-pr:18368>)
 - MiniCPM-V-2.5 or above (<gh-pr:23327>, <gh-pr:23948>)
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 8e9ab9649bd44..118cce810a1f2 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -25,9 +25,11 @@ from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
+                                               ReplicatedLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.multimodal.utils import run_dp_sharded_vision_model
 
 NORM2FN = {
     'rms_norm': RMSNorm,
@@ -137,6 +139,7 @@ class InternParallelAttention(nn.Module):
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
@@ -150,8 +153,10 @@ class InternParallelAttention(nn.Module):
                 f'(got `embed_dim`: {self.embed_dim} and `num_heads`:'
                 f' {self.num_heads}).')
 
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = (1 if use_data_parallel else
+                        get_tensor_model_parallel_world_size())
+        self.tp_rank = (0 if use_data_parallel else
+                        get_tensor_model_parallel_rank())
 
         # Additional dummy heads are used to enable TP for common GPU counts.
         self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim
@@ -159,14 +164,23 @@ class InternParallelAttention(nn.Module):
                                               self.tp_size)
 
         self.scale = self.head_dim**-0.5
-        self.qkv = QKVParallelLinear(
-            self.embed_dim,
-            self.head_dim,
-            num_dummy_heads + self.num_heads,
-            bias=config.qkv_bias,
-            quant_config=quant_config,
-            prefix=f"{prefix}.qkv",
-        )
+        if use_data_parallel:
+            self.qkv = ReplicatedLinear(
+                self.embed_dim,
+                3 * self.head_dim * self.num_heads,
+                bias=config.qkv_bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv",
+            )
+        else:
+            self.qkv = QKVParallelLinear(
+                self.embed_dim,
+                self.head_dim,
+                num_dummy_heads + self.num_heads,
+                bias=config.qkv_bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv",
+            )
 
         self.qk_normalization = config.qk_normalization
 
@@ -178,12 +192,20 @@ class InternParallelAttention(nn.Module):
                                   eps=config.layer_norm_eps,
                                   var_hidden_size=self.embed_dim)
 
-        self.proj = RowParallelLinear(
-            self.dummy_dim,
-            self.embed_dim,
-            quant_config=quant_config,
-            prefix=f"{prefix}.proj",
-        )
+        if use_data_parallel:
+            self.proj = ReplicatedLinear(
+                self.dummy_dim,
+                self.embed_dim,
+                quant_config=quant_config,
+                prefix=f"{prefix}.proj",
+            )
+        else:
+            self.proj = RowParallelLinear(
+                self.dummy_dim,
+                self.embed_dim,
+                quant_config=quant_config,
+                prefix=f"{prefix}.proj",
+            )
 
         self.attn = MultiHeadAttention(self.num_heads_per_partition,
                                        self.head_dim, self.scale)
@@ -287,21 +309,26 @@ class InternMLP(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
         self.config = config
         self.activation_fn = get_act_fn(config.hidden_act)
-        self.fc1 = ColumnParallelLinear(config.hidden_size,
-                                        config.intermediate_size,
-                                        bias=True,
-                                        quant_config=quant_config,
-                                        prefix=f"{prefix}.fc1")
-        self.fc2 = RowParallelLinear(config.intermediate_size,
-                                     config.hidden_size,
-                                     bias=True,
-                                     quant_config=quant_config,
-                                     prefix=f"{prefix}.fc2")
+        cls_fc1 = (ReplicatedLinear
+                   if use_data_parallel else ColumnParallelLinear)
+        self.fc1 = cls_fc1(config.hidden_size,
+                           config.intermediate_size,
+                           bias=True,
+                           quant_config=quant_config,
+                           prefix=f"{prefix}.fc1")
+        cls_fc2 = (ReplicatedLinear
+                   if use_data_parallel else RowParallelLinear)
+        self.fc2 = cls_fc2(config.intermediate_size,
+                           config.hidden_size,
+                           bias=True,
+                           quant_config=quant_config,
+                           prefix=f"{prefix}.fc2")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states, _ = self.fc1(hidden_states)
@@ -320,6 +347,7 @@ class InternVisionEncoderLayer(nn.Module):
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
@@ -330,11 +358,13 @@ class InternVisionEncoderLayer(nn.Module):
         self.attn = self._init_attn(config,
                                     quant_config,
                                     num_dummy_heads=num_dummy_heads,
-                                    prefix=f"{prefix}.attn")
+                                    prefix=f"{prefix}.attn",
+                                    use_data_parallel=use_data_parallel)
 
         self.mlp = InternMLP(config,
                              quant_config=quant_config,
-                             prefix=f"{prefix}.mlp")
+                             prefix=f"{prefix}.mlp",
+                             use_data_parallel=use_data_parallel)
         self.norm1 = NORM2FN[self.norm_type](self.embed_dim,
                                              eps=config.layer_norm_eps)
         self.norm2 = NORM2FN[self.norm_type](self.embed_dim,
@@ -352,16 +382,20 @@ class InternVisionEncoderLayer(nn.Module):
         *,
         num_dummy_heads: int,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         # fallback to sdpa attention if tp unavailable
-        tp_size = get_tensor_model_parallel_world_size()
+        # Data-parallel mode does not shard the encoder, so use a TP size of 1.
+        tp_size = (1 if use_data_parallel else
+                   get_tensor_model_parallel_world_size())
         num_heads = config.num_attention_heads
 
         if (num_heads + num_dummy_heads) % tp_size == 0:
             return InternParallelAttention(config,
                                            quant_config=quant_config,
                                            num_dummy_heads=num_dummy_heads,
-                                           prefix=prefix)
+                                           prefix=prefix,
+                                           use_data_parallel=use_data_parallel)
 
         return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads)
 
@@ -388,6 +422,7 @@ class InternVisionEncoder(nn.Module):
         num_hidden_layers_override: Optional[int] = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ):
         super().__init__()
 
@@ -402,7 +437,8 @@ class InternVisionEncoder(nn.Module):
             InternVisionEncoderLayer(config,
                                      quant_config,
                                      num_dummy_heads=num_dummy_heads,
-                                     prefix=f"{prefix}.layers.{layer_idx}")
+                                     prefix=f"{prefix}.layers.{layer_idx}",
+                                     use_data_parallel=use_data_parallel)
             for layer_idx in range(num_hidden_layers)
         ])
 
@@ -429,10 +465,12 @@ class InternVisionModel(nn.Module):
         num_hidden_layers_override: Optional[int] = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
+        use_data_parallel: bool = False,
     ) -> None:
         super().__init__()
 
         self.config = config
+        self.use_data_parallel = use_data_parallel
 
         self.embeddings = InternVisionEmbeddings(config)
         self.encoder = InternVisionEncoder(
@@ -441,6 +479,7 @@ class InternVisionModel(nn.Module):
             num_hidden_layers_override=num_hidden_layers_override,
             num_dummy_heads=num_dummy_heads,
             prefix=f"{prefix}.encoder",
+            use_data_parallel=use_data_parallel,
         )
 
     def get_input_embeddings(self):
@@ -464,7 +503,11 @@ class InternVisionModel(nn.Module):
                 raise ValueError(
                     f'wrong pixel_values size: {pixel_values.shape}')
 
-        encoder_outputs = self.encoder(inputs_embeds=hidden_states)
+        if self.use_data_parallel:
+            encoder_outputs = run_dp_sharded_vision_model(
+                hidden_states, self.encoder)
+        else:
+            encoder_outputs = self.encoder(inputs_embeds=hidden_states)
 
         return encoder_outputs
 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 9565628b198e2..6a5c565b52e85 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -1035,6 +1035,8 @@ class InternVLMultiModalProcessor(
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
                         SupportsLoRA):
 
+    supports_encoder_tp_data = True
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
         if modality.startswith("image"):
@@ -1053,6 +1055,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
 
         self.config = config
         self.multimodal_config = multimodal_config
+        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
         self._patch_quant_config(config, quant_config)
 
         image_size = config.force_image_size or config.vision_config.image_size
@@ -1120,7 +1123,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
                 quant_config=quant_config,
                 num_hidden_layers_override=num_hidden_layers,
                 prefix=prefix,
-            )
+                use_data_parallel=self.use_data_parallel)
         else:
             return InternVisionPatchModel(config.vision_config)
 

From bec060fd99e371b1adc53f65636061f702fa8e61 Mon Sep 17 00:00:00 2001
From: Andrew Sansom <andrew@protopia.ai>
Date: Wed, 17 Sep 2025 23:25:07 -0500
Subject: [PATCH 07/58] Mark prompt logprobs as incompatible with prompt embeds
 at API level (#25077)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
---
 .../test_completion_with_prompt_embeds.py       | 17 +++++++++++++++++
 vllm/engine/llm_engine.py                       | 11 +++++++----
 vllm/entrypoints/openai/serving_completion.py   |  5 +++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index dbfb1b024f7c2..7b58f851a4d21 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -228,3 +228,20 @@ async def test_completions_with_logprobs_and_prompt_embeds(
             assert max(logprobs_arg,
                        1) <= len(top_logprobs) <= logprobs_arg + 1
         assert len(logprobs.tokens) == 5
+
+
+@pytest.mark.asyncio
+async def test_prompt_logprobs_raises_error(
+        client_with_prompt_embeds: openai.AsyncOpenAI):
+    with pytest.raises(BadRequestError, match="not compatible"):
+        encoded_embeds = create_dummy_embeds()
+        await client_with_prompt_embeds.completions.create(
+            model=MODEL_NAME,
+            prompt="",
+            max_tokens=5,
+            temperature=0.0,
+            extra_body={
+                "prompt_embeds": encoded_embeds,
+                "prompt_logprobs": True
+            },
+        )
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index c35bd20371d0a..34b5dcb587503 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -671,10 +671,13 @@ class LLMEngine:
             arrival_time = time.time()
 
         if (isinstance(prompt, dict)
-                and prompt.get("prompt_embeds", None) is not None
-                and not prompt.get("prompt_token_ids", None)):
-            seq_len = prompt["prompt_embeds"].shape[0]
-            prompt["prompt_token_ids"] = [0] * seq_len
+                and prompt.get("prompt_embeds", None) is not None):
+            if not prompt.get("prompt_token_ids", None):
+                seq_len = prompt["prompt_embeds"].shape[0]
+                prompt["prompt_token_ids"] = [0] * seq_len
+            if params.prompt_logprobs is not None:
+                raise ValueError(
+                    "prompt_logprobs is not compatible with prompt embeds.")
 
         processed_inputs = self.input_preprocessor.preprocess(
             prompt,
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 044f08f32b0d3..0c61c48da0bc8 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -112,6 +112,11 @@ class OpenAIServingCompletion(OpenAIServing):
             return self.create_error_response(
                 "Echo is unsupported with prompt embeds.")
 
+        if (request.prompt_logprobs is not None
+                and request.prompt_embeds is not None):
+            return self.create_error_response(
+                "prompt_logprobs is not compatible with prompt embeds.")
+
         request_id = (
             f"cmpl-"
             f"{self._base_request_id(raw_request, request.request_id)}")

From 3bc18127ff1c644257abcf84a1a56fab8c0d3f0c Mon Sep 17 00:00:00 2001
From: Chaojun Zhang <chaojun.zhang@intel.com>
Date: Thu, 18 Sep 2025 12:30:10 +0800
Subject: [PATCH 08/58] [XPU] Whisper model support on XPU Platform (#25123)

Signed-off-by: chzhang <chaojun.zhang@intel.com>
---
 vllm/attention/layer.py | 4 ++--
 vllm/v1/worker/utils.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 22dc6dcbc8d62..15c0ce33e9659 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module):
             backend = _Backend.FLASH_ATTN
             use_upstream_fa = True
 
-        if current_platform.is_rocm():
-            # currently, only torch_sdpa is supported on rocm
+        if current_platform.is_rocm() or current_platform.is_xpu():
+            # currently, only torch_sdpa is supported on rocm/xpu
             self.attn_backend = _Backend.TORCH_SDPA
         else:
 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index fc831a73a75e3..b76ac633892f3 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -282,7 +282,7 @@ def bind_kv_cache(
             # TODO - analyze where runner_kv_caches is used and the right
             # way to ensure it properly reflects multiple attention layers
             # in the same decoder block.
-            if current_platform.is_cuda():
+            if current_platform.is_cuda() or current_platform.is_xpu():
                 # We know that the GPU runner is not impacted by this
                 # case. Some test code depends on runner_kv_caches, but
                 # not in a way that's impacted by ignoring this.

From 9d8a2d86d24b8afd849d18ddb4ef51cec1c0471d Mon Sep 17 00:00:00 2001
From: YiwenC <54658925+666even666@users.noreply.github.com>
Date: Wed, 17 Sep 2025 21:51:35 -0700
Subject: [PATCH 09/58] [EPLB] Add EPLB support for hunyuan_v1 (#23078)

---
 vllm/model_executor/layers/fused_moe/layer.py |   4 +-
 vllm/model_executor/models/hunyuan_v1.py      | 135 ++++++++++++++++--
 2 files changed, 123 insertions(+), 16 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index ae3b67a2b84e6..da513d75da4da 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1508,8 +1508,8 @@ class FusedMoE(CustomOp):
 
         return [
             weight.view(self.local_num_experts, -1) for name, weight in weights
-            if name not in NON_EXPERT_WEIGHTS
-            and not name.startswith("_shared_experts.")
+            if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size(
+                []) and not name.startswith("_shared_experts.")
         ]
 
     def set_eplb_state(
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index db054b5c537e8..4110c8a1fd08d 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -23,7 +23,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only HunYuan model compatible with HuggingFace weights."""
-from collections.abc import Iterable
+import typing
+from collections.abc import Callable, Iterable
 from typing import Any, Optional, Union
 
 import regex as re
@@ -33,8 +34,8 @@ from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import (get_pp_group,
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (get_ep_group, get_pp_group,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -56,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter,
                     make_layers, maybe_prefix)
 
@@ -355,10 +356,16 @@ class HunYuanSparseMoeBlock(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         layer_id: int = -1,
         prefix: str = "",
+        enable_eplb: bool = False,
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
 
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.num_experts
+
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
@@ -379,8 +386,23 @@ class HunYuanSparseMoeBlock(nn.Module):
                 config.moe_intermediate_size, int) else
                                  config.moe_intermediate_size[layer_id])
 
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        eplb_config = vllm_config.parallel_config.eplb_config
+        self.enable_eplb = enable_eplb
+
+        self.n_logical_experts = self.n_routed_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
+        self.n_physical_experts = (self.n_logical_experts +
+                                   self.n_redundant_experts)
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+        self.physical_expert_start = (self.ep_rank *
+                                      self.n_local_physical_experts)
+        self.physical_expert_end = (self.physical_expert_start +
+                                    self.n_local_physical_experts)
+
         self.experts = FusedMoE(
-            num_experts=config.num_experts,
+            num_experts=self.n_routed_experts,
             top_k=top_k,
             hidden_size=config.hidden_size,
             intermediate_size=intermediate_size,
@@ -388,6 +410,8 @@ class HunYuanSparseMoeBlock(nn.Module):
             renormalize=top_k > 1,
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
         )
 
         self.gate = ReplicatedLinear(config.hidden_size,
@@ -446,6 +470,7 @@ class HunYuanDecoderLayer(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         layer_id: int = -1,
+        enable_eplb: bool = False,
     ) -> None:
         super().__init__()
         assert layer_id >= 0
@@ -509,6 +534,7 @@ class HunYuanDecoderLayer(nn.Module):
                 quant_config=quant_config,
                 layer_id=layer_id,
                 prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
             )
         else:
             self.mlp = HunYuanMLP(
@@ -562,6 +588,9 @@ class HunYuanModel(nn.Module):
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
+        eplb_config = vllm_config.parallel_config.eplb_config
+        enable_eplb = vllm_config.parallel_config.enable_eplb
+        self.num_redundant_experts = eplb_config.num_redundant_experts
 
         self.config = config
         self.quant_config = quant_config
@@ -588,6 +617,7 @@ class HunYuanModel(nn.Module):
                 cache_config=cache_config,
                 quant_config=quant_config,
                 prefix=prefix,
+                enable_eplb=enable_eplb,
             ),
             prefix=f"{prefix}.layers",
         )
@@ -674,6 +704,7 @@ class HunYuanModel(nn.Module):
                 ckpt_down_proj_name="down_proj",
                 ckpt_up_proj_name="up_proj",
                 num_experts=self.config.num_experts,
+                num_redundant_experts=self.num_redundant_experts,
             )
         else:
             return []
@@ -803,25 +834,43 @@ class HunYuanModel(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                is_expert_weight = False
                 for mapping in expert_params_mapping:
                     param_name, weight_name, expert_id, shard_id = mapping
                     if weight_name not in name:
                         continue
-                    name = name.replace(weight_name, param_name)
-                    # Skip layers on other devices.
-                    if is_pp_missing_parameter(name, self):
+                    # This is an expert weight, so it must not be
+                    # loaded through the non-expert path later.
+                    is_expert_weight = True
+
+                    # Do not modify `name` since the loop may continue here
+                    # Instead, create a new variable
+                    name_mapped = name.replace(weight_name, param_name)
+                    if is_pp_missing_parameter(name_mapped, self):
                         continue
-                    param = params_dict[name]
-                    weight_loader = param.weight_loader
-                    weight_loader(
+                    param = params_dict[name_mapped]
+                    # Ask the weight loader to report whether it actually
+                    # loaded the weight; otherwise we could skip an expert
+                    # that still has other replicas available.
+                    weight_loader = typing.cast(Callable[..., bool],
+                                                param.weight_loader)
+                    success = weight_loader(
                         param,
                         loaded_weight,
-                        name,
+                        name_mapped,
                         shard_id=shard_id,
                         expert_id=expert_id,
+                        return_success=True,
                     )
-                    break
+                    if success:
+                        name = name_mapped
+                        break
                 else:
+                    if is_expert_weight:
+                        # We've confirmed that this is an expert weight,
+                        # but it is not mapped to this rank locally,
+                        # so we simply skip it.
+                        continue
                     # Remapping the name of FP8 kv-scale.
                     name = maybe_remap_kv_scale_name(name, params_dict)
                     if name is None:
@@ -841,7 +890,7 @@ class HunYuanModel(nn.Module):
         return loaded_params
 
 
-class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP):
+class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -883,6 +932,64 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP):
         else:
             self.lm_head = PPMissingLayer()
 
+        # Set MoE hyperparameters
+        self.expert_weights = []
+        self.num_expert_groups = 1
+        self.moe_layers: list[FusedMoE] = []
+        example_layer = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, HunYuanDecoderLayer)
+            if isinstance(layer.mlp, HunYuanSparseMoeBlock):
+                example_layer = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_layer is None:
+            raise RuntimeError("No HunYuanMoE layer found in model.layers.")
+
+        self.num_moe_layers = len(self.moe_layers)
+        self.num_logical_experts = example_layer.n_logical_experts
+        self.num_physical_experts = example_layer.n_physical_experts
+        self.num_local_physical_experts = example_layer.n_local_physical_experts
+        self.num_routed_experts = example_layer.n_routed_experts
+        self.num_redundant_experts = example_layer.n_redundant_experts
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ) -> None:
+        for layer_idx, layer in enumerate(self.moe_layers):
+            self.expert_weights.append(layer.get_expert_weights())
+            # Register the expert weights.
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = (num_physical_experts -
+                                      self.num_logical_experts)
+        for layer in self.model.layers:
+            if isinstance(layer.mlp, HunYuanSparseMoeBlock):
+                moe = layer.mlp
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
     def forward(
         self,
         input_ids: torch.Tensor,

From 5c65a72bb17b34bc6eb0d7ca43b10938c88dc7e3 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 17 Sep 2025 22:05:25 -0700
Subject: [PATCH 10/58] [V0 Deprecation] Remove more V0 tests (#25117)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 .buildkite/test-pipeline.yaml                 |   6 -
 .github/CODEOWNERS                            |   2 -
 tests/async_engine/__init__.py                |   0
 tests/async_engine/api_server_async_engine.py |  54 --
 tests/async_engine/conftest.py                |  12 -
 tests/async_engine/test_api_server.py         | 139 ------
 tests/async_engine/test_request_tracker.py    |  71 ---
 tests/basic_correctness/test_preemption.py    | 189 -------
 tests/detokenizer/conftest.py                 |  11 -
 tests/detokenizer/test_stop_checker.py        |  83 ----
 .../openai/correctness/test_lmeval.py         |  10 -
 tests/samplers/test_logprobs.py               | 182 -------
 tests/worker/__init__.py                      |   0
 tests/worker/conftest.py                      |  11 -
 tests/worker/test_model_input.py              | 113 -----
 tests/worker/test_model_runner.py             | 462 ------------------
 tests/worker/test_profile.py                  |  68 ---
 tests/worker/test_swap.py                     |  87 ----
 18 files changed, 1500 deletions(-)
 delete mode 100644 tests/async_engine/__init__.py
 delete mode 100644 tests/async_engine/api_server_async_engine.py
 delete mode 100644 tests/async_engine/conftest.py
 delete mode 100644 tests/async_engine/test_api_server.py
 delete mode 100644 tests/async_engine/test_request_tracker.py
 delete mode 100644 tests/basic_correctness/test_preemption.py
 delete mode 100644 tests/detokenizer/conftest.py
 delete mode 100644 tests/detokenizer/test_stop_checker.py
 delete mode 100644 tests/samplers/test_logprobs.py
 delete mode 100644 tests/worker/__init__.py
 delete mode 100644 tests/worker/conftest.py
 delete mode 100644 tests/worker/test_model_input.py
 delete mode 100644 tests/worker/test_model_runner.py
 delete mode 100644 tests/worker/test_profile.py
 delete mode 100644 tests/worker/test_swap.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0bce02b90a7cd..8dd99bf1a38f6 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -46,22 +46,18 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
-  - tests/async_engine
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
   - tests/utils_
-  - tests/worker
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
   commands:
   - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s async_engine # AsyncLLMEngine
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s utils_ # Utils
-  - pytest -v -s worker # Worker
   - pytest -v -s transformers_utils # transformers_utils
 
 - label: Python-only Installation Test # 10min
@@ -82,14 +78,12 @@ steps:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
-  - tests/basic_correctness/test_preemption
   - tests/basic_correctness/test_cumem.py
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Entrypoints Unit Tests # 5min
   timeout_in_minutes: 10
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 771dd2e172586..b8d6db06548d5 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -41,7 +41,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
@@ -50,7 +49,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py
deleted file mode 100644
index ec6b20f5e04b9..0000000000000
--- a/tests/async_engine/api_server_async_engine.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""vllm.entrypoints.api_server with some extra logging for testing."""
-from collections.abc import Iterable
-from typing import Any
-
-import uvicorn
-from fastapi.responses import JSONResponse, Response
-
-import vllm.entrypoints.api_server
-import vllm.envs as envs
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.utils import FlexibleArgumentParser
-
-app = vllm.entrypoints.api_server.app
-
-
-class AsyncLLMEngineWithStats(AsyncLLMEngine):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._num_aborts = 0
-
-    async def _engine_abort(self, request_ids: Iterable[str]):
-        ids = list(request_ids)
-        self._num_aborts += len(ids)
-        await super()._engine_abort(ids)
-
-    def testing_stats(self) -> dict[str, Any]:
-        return {"num_aborted_requests": self._num_aborts}
-
-
-@app.get("/stats")
-def stats() -> Response:
-    """Get the statistics of the engine."""
-    return JSONResponse(engine.testing_stats())
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8000)
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
-
-    engine_args = AsyncEngineArgs.from_cli_args(args)
-    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
-    vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(app,
-                host=args.host,
-                port=args.port,
-                log_level="debug",
-                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py
deleted file mode 100644
index 375b248ebedaa..0000000000000
--- a/tests/async_engine/conftest.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
deleted file mode 100644
index 07370a8803291..0000000000000
--- a/tests/async_engine/test_api_server.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import copyreg
-import os
-import subprocess
-import sys
-import time
-from multiprocessing import Pool
-from pathlib import Path
-
-import pytest
-import requests
-import urllib3.exceptions
-
-
-def _pickle_new_connection_error(obj):
-    """Custom pickler for NewConnectionError to fix tblib compatibility."""
-    # Extract the original message by removing the "conn: " prefix
-    full_message = obj.args[0] if obj.args else ""
-    if ': ' in full_message:
-        # Split off the connection part and keep the actual message
-        _, actual_message = full_message.split(': ', 1)
-    else:
-        actual_message = full_message
-    return _unpickle_new_connection_error, (actual_message, )
-
-
-def _unpickle_new_connection_error(message):
-    """Custom unpickler for NewConnectionError."""
-    # Create with None as conn and the actual message
-    return urllib3.exceptions.NewConnectionError(None, message)
-
-
-# Register the custom pickle/unpickle functions for tblib compatibility
-copyreg.pickle(urllib3.exceptions.NewConnectionError,
-               _pickle_new_connection_error)
-
-
-def _query_server(prompt: str, max_tokens: int = 5) -> dict:
-    response = requests.post("http://localhost:8000/generate",
-                             json={
-                                 "prompt": prompt,
-                                 "max_tokens": max_tokens,
-                                 "temperature": 0,
-                                 "ignore_eos": True
-                             })
-    response.raise_for_status()
-    return response.json()
-
-
-def _query_server_long(prompt: str) -> dict:
-    return _query_server(prompt, max_tokens=500)
-
-
-@pytest.fixture
-def api_server(distributed_executor_backend: str):
-    script_path = Path(__file__).parent.joinpath(
-        "api_server_async_engine.py").absolute()
-    commands = [
-        sys.executable,
-        "-u",
-        str(script_path),
-        "--model",
-        "facebook/opt-125m",
-        "--host",
-        "127.0.0.1",
-        "--distributed-executor-backend",
-        distributed_executor_backend,
-    ]
-
-    # API Server Test Requires V0.
-    my_env = os.environ.copy()
-    my_env["VLLM_USE_V1"] = "0"
-    uvicorn_process = subprocess.Popen(commands, env=my_env)
-    yield
-    uvicorn_process.terminate()
-
-
-@pytest.mark.timeout(300)
-@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
-def test_api_server(api_server, distributed_executor_backend: str):
-    """
-    Run the API server and test it.
-
-    We run both the server and requests in separate processes.
-
-    We test that the server can handle incoming requests, including
-    multiple requests at the same time, and that it can handle requests
-    being cancelled without crashing.
-    """
-    with Pool(32) as pool:
-        # Wait until the server is ready
-        prompts = ["warm up"] * 1
-        result = None
-        while not result:
-            try:
-                for r in pool.map(_query_server, prompts):
-                    result = r
-                    break
-            except requests.exceptions.ConnectionError:
-                time.sleep(1)
-
-        # Actual tests start here
-        # Try with 1 prompt
-        for result in pool.map(_query_server, prompts):
-            assert result
-
-        num_aborted_requests = requests.get(
-            "http://localhost:8000/stats").json()["num_aborted_requests"]
-        assert num_aborted_requests == 0
-
-        # Try with 100 prompts
-        prompts = ["test prompt"] * 100
-        for result in pool.map(_query_server, prompts):
-            assert result
-
-    with Pool(32) as pool:
-        # Cancel requests
-        prompts = ["canceled requests"] * 100
-        pool.map_async(_query_server_long, prompts)
-        time.sleep(0.01)
-        pool.terminate()
-        pool.join()
-
-        # check cancellation stats
-        # give it some time to update the stats
-        time.sleep(1)
-
-        num_aborted_requests = requests.get(
-            "http://localhost:8000/stats").json()["num_aborted_requests"]
-        assert num_aborted_requests > 0
-
-    # check that server still runs after cancellations
-    with Pool(32) as pool:
-        # Try with 100 prompts
-        prompts = ["test prompt after canceled"] * 100
-        for result in pool.map(_query_server, prompts):
-            assert result
diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py
deleted file mode 100644
index 1851eeeda7905..0000000000000
--- a/tests/async_engine/test_request_tracker.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm.engine.async_llm_engine import RequestTracker
-from vllm.outputs import RequestOutput
-
-
-@pytest.mark.asyncio
-async def test_request_tracker():
-    tracker = RequestTracker()
-    stream_1 = tracker.add_request("1")
-    assert tracker.new_requests_event.is_set()
-    await tracker.wait_for_new_requests()
-    new, aborted = tracker.get_new_and_aborted_requests()
-    assert not tracker.new_requests_event.is_set()
-    assert len(new) == 1
-    assert new[0]["request_id"] == "1"
-    assert not aborted
-    assert not stream_1.finished
-
-    stream_2 = tracker.add_request("2")
-    stream_3 = tracker.add_request("3")
-    assert tracker.new_requests_event.is_set()
-    await tracker.wait_for_new_requests()
-    new, aborted = tracker.get_new_and_aborted_requests()
-    assert not tracker.new_requests_event.is_set()
-    assert len(new) == 2
-    assert new[0]["request_id"] == "2"
-    assert new[1]["request_id"] == "3"
-    assert not aborted
-    assert not stream_2.finished
-    assert not stream_3.finished
-
-    # request_ids must be unique
-    with pytest.raises(KeyError):
-        tracker.add_request("1")
-    assert not tracker.new_requests_event.is_set()
-
-    tracker.abort_request("1")
-    new, aborted = tracker.get_new_and_aborted_requests()
-    assert len(aborted) == 1
-    assert "1" in aborted
-    assert not new
-    assert stream_1.finished
-
-    stream_4 = tracker.add_request("4")
-    tracker.abort_request("4")
-    assert tracker.new_requests_event.is_set()
-    await tracker.wait_for_new_requests()
-    new, aborted = tracker.get_new_and_aborted_requests()
-    # aborted new requests will cancel each other out -
-    # there's no need for them to propagate into the
-    # engine
-    assert not aborted
-    assert not new
-    assert stream_4.finished
-
-    stream_5 = tracker.add_request("5")
-    assert tracker.new_requests_event.is_set()
-    tracker.process_request_output(
-        RequestOutput("2", "output", [], [], [], finished=True))
-    await tracker.wait_for_new_requests()
-    new, aborted = tracker.get_new_and_aborted_requests()
-    assert not tracker.new_requests_event.is_set()
-    assert not aborted
-    assert len(new) == 1
-    assert new[0]["request_id"] == "5"
-    assert stream_2.finished
-    assert not stream_5.finished
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
deleted file mode 100644
index db2fa2f6bef6f..0000000000000
--- a/tests/basic_correctness/test_preemption.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the short outputs of HF and vLLM when using greedy sampling.
-
-VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.
-
-Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1
-pytest tests/basic_correctness/test_preemption.py`.
-"""
-import pytest
-from prometheus_client import REGISTRY
-
-import vllm.envs as envs
-from vllm import SamplingParams
-from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
-                                 ENABLE_ARTIFICIAL_PREEMPT)
-
-from ..models.utils import check_outputs_equal
-
-MODELS = [
-    "distilbert/distilgpt2",
-]
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT,
-    so use VLLM_USE_V1=0 for all tests in the file.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-
-
-@pytest.fixture(scope="module", autouse=True)
-def check_settings():
-    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
-        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1."
-        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 "
-        "pytest tests/basic_correctness/test_preemption.py`")
-
-
-@pytest.fixture
-def distributed_executor_backend() -> str:
-    # When SPMD worker is used, use distributed_executor_backend="ray"
-    # to test delta input optimization works with preemption.
-    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [96])
-@pytest.mark.parametrize("chunked_prefill_token_size", [16])
-def test_chunked_prefill_recompute(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    distributed_executor_backend: str,
-) -> None:
-    """Ensure that chunked prefill works with preemption."""
-    max_num_seqs = min(chunked_prefill_token_size, 256)
-    enable_chunked_prefill = False
-    max_num_batched_tokens = None
-    if chunked_prefill_token_size != -1:
-        enable_chunked_prefill = True
-        max_num_batched_tokens = chunked_prefill_token_size
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_seqs=max_num_seqs,
-            distributed_executor_backend=distributed_executor_backend,
-            disable_log_stats=False,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-
-    for i in range(len(example_prompts)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
-def test_preemption(
-    caplog_vllm,
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    distributed_executor_backend: str,
-) -> None:
-    """By default, recompute preemption is enabled"""
-
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            disable_log_stats=False,
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-        total_preemption = (
-            vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
-
-    assert ("is preempted by PreemptionMode.RECOMPUTE mode because there "
-            "is not enough KV cache space." in caplog_vllm.text)
-    # Ensure the count bucket of request-level histogram metrics matches
-    # the number of requests as a simple sanity check to ensure metrics are
-    # generated
-    preemption_metrics = None
-    for m in REGISTRY.collect():
-        if m.name == "vllm:num_preemptions":
-            preemption_metrics = m
-    assert preemption_metrics is not None
-    total_recorded_preemption = 0
-    for sample in preemption_metrics.samples:
-        total_recorded_preemption += sample.value
-    assert total_preemption == total_recorded_preemption
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
-def test_preemption_infeasible(
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-    distributed_executor_backend: str,
-) -> None:
-    """Verify infeasible preemption request will be ignored."""
-    BLOCK_SIZE = 16
-    prefill_blocks = 2
-    decode_blocks = max_tokens // BLOCK_SIZE
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            block_size=BLOCK_SIZE,
-            # Not enough gpu blocks to complete a single sequence.
-            # preemption should happen, and the sequence should be
-            # ignored instead of hanging forever.
-            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
-            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-            distributed_executor_backend=distributed_executor_backend,
-    ) as vllm_model:
-        sampling_params = SamplingParams(max_tokens=max_tokens,
-                                         ignore_eos=True)
-        req_outputs = vllm_model.llm.generate(
-            example_prompts,
-            sampling_params=sampling_params,
-        )
-
-        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
-                < ARTIFICIAL_PREEMPTION_MAX_CNT)
-
-    # Verify the request is ignored and not hang.
-    for req_output in req_outputs:
-        outputs = req_output.outputs
-        assert len(outputs) == 1
-        assert outputs[0].finish_reason == "length"
diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py
deleted file mode 100644
index f2c125355c83c..0000000000000
--- a/tests/detokenizer/conftest.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-
-
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py
deleted file mode 100644
index 2ca10c072b342..0000000000000
--- a/tests/detokenizer/test_stop_checker.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-from vllm.engine.output_processor.stop_checker import StopChecker
-from vllm.inputs import token_inputs
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob, Sequence, SequenceStatus
-
-
-def sequence_with_eos(text: str, eos_token: str,
-                      eos_token_id: int) -> Sequence:
-    """
-    Create a Sequence that ends with an EOS token.
-    """
-    seq = Sequence(
-        seq_id=0,
-        inputs=token_inputs([]),
-        block_size=16,
-        eos_token_id=eos_token_id,
-    )
-    seq.output_text = text + eos_token
-
-    offset = eos_token_id + 1
-    for i in range(offset, len(text) + offset):
-        seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)})
-    seq.append_token_id(token_id=eos_token_id,
-                        logprobs={eos_token_id: Logprob(0.0)})
-
-    seq.status = SequenceStatus.RUNNING
-
-    return seq
-
-
-@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [
-    ("This text ends with EOS token", "</s>", 2),
-])
-@pytest.mark.parametrize("ignore_eos", [True, False])
-@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
-@pytest.mark.skip_global_cleanup
-def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
-                           ignore_eos: bool, include_stop_str_in_output: bool):
-    """
-    Test the behavior of the StopChecker's maybe_stop_sequence method
-    when an EOS token is encountered.
-
-    This test covers:
-    - When the EOS token should stop the sequence and be removed from the output
-    - When the EOS token should stop the sequence and be included in the output
-    - When the EOS token should be ignored, and the sequence continues
-    """
-
-    stop_checker = StopChecker(max_model_len=1024)
-
-    seq = sequence_with_eos(
-        text=text_wo_eos,
-        eos_token=eos_token,
-        eos_token_id=eos_token_id,
-    )
-    new_char_count = len(eos_token)
-
-    # Note that `stop` and `stop_token_ids` are not specified
-    sampling_params = SamplingParams(
-        min_tokens=1,
-        ignore_eos=ignore_eos,
-        include_stop_str_in_output=include_stop_str_in_output)
-
-    stop_checker.maybe_stop_sequence(
-        seq=seq,
-        new_char_count=new_char_count,
-        sampling_params=sampling_params,
-    )
-
-    if ignore_eos:
-        assert seq.status == SequenceStatus.RUNNING
-        assert seq.output_text == text_wo_eos + eos_token
-    elif include_stop_str_in_output:
-        assert seq.status == SequenceStatus.FINISHED_STOPPED
-        assert seq.output_text == text_wo_eos + eos_token
-    else:
-        assert seq.status == SequenceStatus.FINISHED_STOPPED
-        assert seq.output_text == text_wo_eos
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py
index 684407cd6ee97..624acd5ffde73 100644
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
             more_args = ["--max-num-seqs", "64"]
 
         run_test(more_args)
-
-
-@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
-                                    more_args):
-    """Run with the V0 Engine."""
-
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "0")
-        run_test(more_args)
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
deleted file mode 100644
index 87f40b1005312..0000000000000
--- a/tests/samplers/test_logprobs.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm import SamplingParams
-
-from ..conftest import VllmRunner
-
-MODELS = ["distilbert/distilgpt2"]
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    This module is V0 only since it uses dtype=float, so
-    set VLLM_USE_V1=0 for all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype",
-                         ["float"])  # needed for comparing logprobs with HF
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
-@pytest.mark.parametrize("num_top_logprobs", [0, 6])  # 32000 == vocab_size
-@pytest.mark.parametrize("detokenize", [True, False])
-def test_get_prompt_logprobs(
-    hf_runner,
-    vllm_runner,
-    model,
-    dtype,
-    chunked_prefill_token_size: int,
-    num_top_logprobs: int,
-    detokenize: bool,
-    example_prompts,
-):
-    max_num_seqs = 256
-    enable_chunked_prefill = False
-    max_num_batched_tokens = None
-    if chunked_prefill_token_size != -1:
-        enable_chunked_prefill = True
-        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
-        max_num_batched_tokens = chunked_prefill_token_size
-
-    max_tokens = 5
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_logprobs = hf_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens=max_tokens,
-        )
-
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_logprobs=num_top_logprobs,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
-                                              logprobs=num_top_logprobs,
-                                              prompt_logprobs=num_top_logprobs,
-                                              temperature=0.0,
-                                              detokenize=detokenize)
-        vllm_results = vllm_model.llm.generate(
-            example_prompts, sampling_params=vllm_sampling_params)
-
-    # Test whether logprobs are included in the results.
-    for result in vllm_results:
-        assert result.prompt_logprobs is not None
-        assert result.outputs[0].logprobs is not None
-        assert len(result.outputs[0].logprobs) == max_tokens
-        for logprobs in result.outputs[0].logprobs:
-            # If the output token is not included in the top X
-            # logprob, it can return 1 more data
-            assert (len(logprobs) == num_top_logprobs
-                    or len(logprobs) == num_top_logprobs + 1)
-        output_text = result.outputs[0].text
-        output_string_from_most_likely_tokens_lst: list[str] = []
-        for top_logprobs in result.outputs[0].logprobs:
-            top_logprob = next(iter(top_logprobs.values()))
-            output_string_from_most_likely_tokens_lst.append(
-                top_logprob.decoded_token)
-
-        if detokenize:
-            output_string_from_most_likely_tokens = "".join(
-                output_string_from_most_likely_tokens_lst)
-            assert output_text == output_string_from_most_likely_tokens, (
-                "The output text from the top logprob for each token position "
-                "should be the same as the output text in the result.")
-        else:
-            assert output_text == ''
-            assert output_string_from_most_likely_tokens_lst == ([None] *
-                                                                 max_tokens)
-
-        # The first prompt logprob is always None
-        assert result.prompt_logprobs[0] is None
-        for prompt_logprobs in result.prompt_logprobs[1:]:
-            # If the prompt token is not included in the top X
-            # logprob, it can return 1 more data
-            assert (len(prompt_logprobs) == num_top_logprobs
-                    or len(prompt_logprobs) == num_top_logprobs + 1)
-
-    # Test whether prompt logprobs are consistent with HF
-    for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs):
-        # Check prompt logprobs
-        # The first prompt logprob is always None, so we compare it from 1:.
-        vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:]
-        for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs):
-            for token_id, logprob in vllm_prompt_logprob_dict.items():
-                torch.testing.assert_close(logprob.logprob,
-                                           hf_logprob[0][i][token_id].item(),
-                                           atol=1e-2,
-                                           rtol=1e-2)
-        vllm_sample_logprobs = vllm_result.outputs[0].logprobs
-        for i, top_logprobs in enumerate(vllm_sample_logprobs):
-            for token_id, sample_logprob in top_logprobs.items():
-                logprob = sample_logprob.logprob
-                torch.testing.assert_close(logprob,
-                                           hf_logprob[i][-1][token_id].item(),
-                                           atol=1e-2,
-                                           rtol=1e-2)
-                if detokenize:
-                    assert isinstance(sample_logprob.decoded_token, str), (
-                        "The token should be decoded by the time it is returned"
-                        " to the user.")
-
-    # Test if prompt logprobs are correctly set.
-    for vllm_result in vllm_results:
-        token_ids = vllm_result.prompt_token_ids
-        prompt_logprobs = vllm_result.prompt_logprobs
-
-        # The first token doesn't have logprob.
-        assert prompt_logprobs[0] is None
-
-        for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]):
-            assert token_id in logprob_dict
-
-
-def test_max_logprobs():
-    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
-    vllm_sampling_params = SamplingParams(logprobs=1)
-    # should pass
-    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
-
-    bad_sampling_params = SamplingParams(logprobs=2)
-    with pytest.raises(ValueError):
-        runner.generate(["Hello world"], sampling_params=bad_sampling_params)
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
-@pytest.mark.parametrize("detokenize", [True, False])
-def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int,
-                       detokenize: bool, example_prompts):
-    max_num_seqs = 256
-    enable_chunked_prefill = False
-    max_num_batched_tokens = None
-    if chunked_prefill_token_size != -1:
-        enable_chunked_prefill = True
-        max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
-        max_num_batched_tokens = chunked_prefill_token_size
-    max_tokens = 5
-
-    with vllm_runner(
-            model,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_batched_tokens=max_num_batched_tokens,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens,
-                                                       logprobs=None,
-                                                       temperature=0.0,
-                                                       detokenize=detokenize)
-        results_logprobs_none = vllm_model.llm.generate(
-            example_prompts, sampling_params=sampling_params_logprobs_none)
-
-    for i in range(len(results_logprobs_none)):
-        assert results_logprobs_none[i].outputs[0].logprobs is None
-        assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py
deleted file mode 100644
index 3f202d4dbe948..0000000000000
--- a/tests/worker/conftest.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    This module tests V0 internals, so set VLLM_USE_V1=0.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
\ No newline at end of file
diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py
deleted file mode 100644
index 0f28ef2ba857b..0000000000000
--- a/tests/worker/test_model_input.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import dataclasses
-
-import torch
-
-from vllm.attention import AttentionMetadata, AttentionMetadataBuilder
-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.utils import CommonAttentionState
-from vllm.model_executor import SamplingMetadata
-from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
-
-
-class MockAttentionBackend(AttentionBackend):
-
-    @staticmethod
-    def get_name() -> str:
-        raise NotImplementedError
-
-    @staticmethod
-    def get_impl_cls():
-        raise NotImplementedError
-
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return AttentionMetadata
-
-    @staticmethod
-    def get_builder_cls() -> type["AttentionMetadataBuilder"]:
-        return AttentionMetadataBuilder
-
-    @staticmethod
-    def get_state_cls() -> type["CommonAttentionState"]:
-        return CommonAttentionState
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> tuple[int, ...]:
-        raise NotImplementedError
-
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dst: torch.Tensor,
-    ) -> None:
-        pass
-
-    @staticmethod
-    def copy_blocks(
-        kv_caches: list[torch.Tensor],
-        src_to_dists: torch.Tensor,
-    ) -> None:
-        pass
-
-
-def test_model_runner_input():
-    sampling_metadata = SamplingMetadata(
-        ["seq_group"],
-        "selected_token_indices",
-        "categorized_sample_indices",
-        "num_prompts",
-    )
-    attn_metadata = AttentionMetadata(
-        num_prefills=1,
-        num_prefill_tokens=2,
-        num_decode_tokens=3,
-        slot_mapping=torch.zeros(1),
-        multi_modal_placeholder_index_maps=None,
-        enable_kv_scales_calculation=True,
-    )
-    model_input = ModelInputForGPUWithSamplingMetadata(
-        input_tokens=torch.ones(10),
-        input_positions=torch.ones(10),
-        sampling_metadata=sampling_metadata,
-        attn_metadata=attn_metadata)
-
-    assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata)
-
-    # Test round trip serialization.
-    tensor_dict = model_input.as_broadcastable_tensor_dict()
-    attn_backend = MockAttentionBackend()
-    received_model_input = (
-        ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
-            tensor_dict, attn_backend=attn_backend))
-    # Check that received copy has correct values.
-    assert isinstance(received_model_input,
-                      ModelInputForGPUWithSamplingMetadata)
-    assert received_model_input.input_tokens is not None
-    assert (
-        received_model_input.input_tokens == model_input.input_tokens).all()
-    assert received_model_input.input_positions is not None
-    assert (received_model_input.input_positions == model_input.input_positions
-            ).all()
-    assert received_model_input.multi_modal_kwargs is None
-    assert (received_model_input.multi_modal_kwargs ==
-            model_input.multi_modal_kwargs)
-    assert received_model_input.lora_requests is None
-    assert received_model_input.lora_requests == model_input.lora_requests
-    assert received_model_input.lora_mapping is None
-    assert received_model_input.lora_mapping == model_input.lora_mapping
-    for field in dataclasses.fields(AttentionMetadata):
-        assert getattr(received_model_input.attn_metadata, field.name,
-                       None) == getattr(attn_metadata, field.name, None)
-    # For sampling metadata, only selected_token_indices is copied.
-    assert (received_model_input.sampling_metadata.selected_token_indices ==
-            sampling_metadata.selected_token_indices)
-    assert received_model_input.sampling_metadata.seq_groups is None
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
deleted file mode 100644
index 0be25aa2fc35d..0000000000000
--- a/tests/worker/test_model_runner.py
+++ /dev/null
@@ -1,462 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             init_distributed_environment)
-from vllm.engine.arg_utils import EngineArgs
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.utils import get_open_port
-from vllm.worker.model_runner import ModelRunner
-
-
-def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
-    engine_args = EngineArgs(model, *args, **kwargs)
-    engine_config = engine_args.create_engine_config()
-    model_runner = ModelRunner(
-        vllm_config=engine_config,
-        is_driver_worker=True,
-    )
-    return model_runner
-
-
-def test_deepseek_mla_attn_backend_module():
-    model_runner = _create_model_runner(
-        "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
-        trust_remote_code=True,
-        enable_chunked_prefill=False,
-    )
-    assert model_runner.attn_backend.__name__ == "TritonMLABackend"
-
-
-@pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))
-@pytest.mark.parametrize("use_prompt_embeds", [True, False])
-def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch):
-    if use_prompt_embeds:
-        # Prompt Embeddings is only currently supported on V0
-        monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    model_runner = _create_model_runner(
-        "facebook/opt-125m",
-        max_num_batched_tokens=100000,
-        max_num_seqs=100000,
-        enable_chunked_prefill=False,
-        enable_prompt_embeds=True,
-    )
-
-    seq_lens: list[int] = []
-    seq_group_metadata_list: list[SequenceGroupMetadata] = []
-    block_tables = {0: [1]}
-    expected_input_embeds_len = 0
-    for i in range(batch_size):
-        # make sure all tokens fit into one block
-        seq_len = i % (model_runner.block_size - 1) + 1
-        seq_lens.append(seq_len)
-        if use_prompt_embeds:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=[0] * seq_len,
-                prompt_embeds=torch.rand(seq_len, 10),
-            )
-            expected_input_embeds_len += seq_len
-        else:
-            seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len))
-
-        seq_group_metadata = SequenceGroupMetadata(
-            request_id=f"test_{i}",
-            is_prompt=True,
-            seq_data={0: seq_data},
-            sampling_params=SamplingParams(temperature=0),
-            block_tables=block_tables,
-        )
-        assert seq_group_metadata.token_chunk_size == seq_data.get_len()
-        seq_group_metadata_list.append(seq_group_metadata)
-
-    expected_selected_token_indices = []
-    selected_token_start_idx = 0
-    for seq_len in seq_lens:
-        expected_selected_token_indices.append(selected_token_start_idx +
-                                               seq_len - 1)
-        selected_token_start_idx += seq_len
-    model_input = model_runner._prepare_model_input_tensors(
-        seq_group_metadata_list)
-    input_tokens = model_input.input_tokens
-    input_positions = model_input.input_positions
-    input_embeds = model_input.inputs_embeds
-    attn_metadata = model_input.attn_metadata
-    return_seq_lens = model_input.seq_lens
-    slot_mapping = attn_metadata.slot_mapping
-    assert return_seq_lens == seq_lens
-    assert len(slot_mapping) == len(input_tokens)
-
-    # Verify input metadata is correct for prompts.
-    device = model_runner.device
-    assert attn_metadata.num_prefills > 0
-    assert attn_metadata.num_decode_tokens == 0
-    torch.testing.assert_close(
-        attn_metadata.seq_lens_tensor,
-        torch.tensor(seq_lens, device=device, dtype=torch.int))
-    assert attn_metadata.seq_lens == seq_lens
-    assert attn_metadata.max_prefill_seq_len == max(seq_lens)
-    assert attn_metadata.max_decode_seq_len == 0
-
-    # Test subquery start locs.
-    start_idx = 0
-    start_loc = [start_idx]
-    for seq_len in seq_lens:
-        start_idx += seq_len
-        start_loc.append(start_idx)
-    torch.testing.assert_close(
-        attn_metadata.query_start_loc,
-        torch.tensor(start_loc, dtype=torch.int32, device=device))
-
-    # Test seq start locs. Note that for normal prefill it is
-    # equivalent to query_start_loc.
-    start_idx = 0
-    seq_start_loc = [start_idx]
-    for seq_len in seq_lens:
-        start_idx += seq_len
-        seq_start_loc.append(start_idx)
-
-    torch.testing.assert_close(
-        attn_metadata.seq_start_loc,
-        torch.tensor(start_loc, dtype=torch.int32, device=device))
-    torch.testing.assert_close(
-        attn_metadata.context_lens_tensor,
-        torch.zeros(attn_metadata.context_lens_tensor.shape[0],
-                    dtype=torch.int,
-                    device=device))
-
-    expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))],
-                            dtype=torch.int32,
-                            device=model_runner.device)
-    torch.testing.assert_close(attn_metadata.block_tables, expected)
-    # Cuda graph should not be used for prerill.
-    assert attn_metadata.use_cuda_graph is False
-
-    assert len(input_tokens) == sum(seq_lens)
-    assert len(input_positions) == sum(seq_lens)
-    if expected_input_embeds_len == 0:
-        torch.testing.assert_close(input_tokens, input_positions)
-        assert input_embeds is None
-    else:
-        assert len(input_embeds) == expected_input_embeds_len
-
-    sampling_metadata = SamplingMetadata.prepare(
-        seq_group_metadata_list,
-        seq_lens,
-        query_lens=seq_lens,
-        device=model_runner.device,
-        pin_memory=model_runner.pin_memory)
-    assert len(input_tokens) == sum(seq_lens)
-    assert len(input_positions) == sum(seq_lens)
-    actual = sampling_metadata.selected_token_indices
-    expected = torch.tensor(expected_selected_token_indices,
-                            device=actual.device,
-                            dtype=actual.dtype)
-    torch.testing.assert_close(actual, expected)
-    torch.allclose(input_tokens, input_positions)
-
-    actual = sampling_metadata.selected_token_indices
-    expected = torch.tensor(expected_selected_token_indices,
-                            device=actual.device,
-                            dtype=actual.dtype)
-    torch.testing.assert_close(actual, expected)
-
-
-@pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))
-@pytest.mark.parametrize("use_prompt_embeds", [True, False])
-def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch):
-    if use_prompt_embeds:
-        # Prompt Embeddings is only currently supported on V0
-        monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    model_runner = _create_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        enforce_eager=False,
-        max_num_batched_tokens=100000,
-        max_num_seqs=100000,
-        enable_chunked_prefill=False,
-        enable_prompt_embeds=True,
-    )
-
-    context_lens: list[int] = []
-    seq_group_metadata_list: list[SequenceGroupMetadata] = []
-    # Assume each seq group finishes prefill.
-    for i in range(batch_size):
-        # make sure all tokens fit into one block
-        context_len = i % (model_runner.block_size - 1) + 1
-        context_lens.append(context_len)
-        if use_prompt_embeds:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=[0] * context_len,
-                prompt_embeds=torch.rand(context_len, 10),
-            )
-            output_embed = torch.rand(10)
-        else:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=range(context_len))
-            output_embed = None
-        seq_data.update_num_computed_tokens(context_len)
-        # Append one token ID since prefill is finished.
-        seq_data.append_token_id(1, 0, output_embed)
-        seq_group_metadata = SequenceGroupMetadata(
-            request_id=f"test_{i}",
-            is_prompt=False,
-            seq_data={0: seq_data},
-            sampling_params=SamplingParams(temperature=0),
-            block_tables={0: [1]},
-        )
-        assert seq_group_metadata.token_chunk_size == 1
-        seq_group_metadata_list.append(seq_group_metadata)
-
-    model_input = model_runner._prepare_model_input_tensors(
-        seq_group_metadata_list)
-    input_tokens = model_input.input_tokens
-    input_positions = model_input.input_positions
-    input_embeds = model_input.inputs_embeds
-    attn_metadata = model_input.attn_metadata
-    slot_mapping = attn_metadata.slot_mapping
-
-    assert len(slot_mapping) == len(input_tokens)
-
-    expected_bs = model_runner.vllm_config.pad_for_cudagraph(
-        len(seq_group_metadata_list))
-    # Verify input metadata is correct for prompts.
-    device = model_runner.device
-    assert attn_metadata.num_prefills == 0
-    assert attn_metadata.num_prefill_tokens == 0
-    seq_lens = [context_len + 1 for context_len in context_lens]
-    # seq_lens are padded to expected_bs
-    for _ in range(expected_bs - len(seq_lens)):
-        seq_lens.append(1)
-    assert attn_metadata.seq_lens == seq_lens
-    assert attn_metadata.num_decode_tokens == len(seq_lens)
-    start_idx = 0
-    start_loc = [start_idx]
-    for _ in context_lens:
-        # decode has only 1 token for query.
-        start_idx += 1
-        start_loc.append(start_idx)
-    torch.testing.assert_close(
-        attn_metadata.query_start_loc,
-        torch.tensor(start_loc, dtype=torch.int32, device=device))
-
-    start_idx = 0
-    seq_start_loc = [start_idx]
-    for seq_len in seq_lens:
-        start_idx += seq_len
-        seq_start_loc.append(start_idx)
-    torch.testing.assert_close(
-        attn_metadata.seq_start_loc,
-        torch.tensor(seq_start_loc, dtype=torch.int32, device=device))
-
-    torch.testing.assert_close(
-        attn_metadata.context_lens_tensor,
-        torch.tensor(context_lens, dtype=torch.int, device=device))
-    assert attn_metadata.max_decode_seq_len == max(seq_lens)
-    torch.testing.assert_close(
-        attn_metadata.seq_lens_tensor[:len(seq_lens)],
-        torch.tensor(seq_lens, dtype=torch.int, device=device))
-
-    # block table's first index corresponds to each batch, meaning in
-    # decoding it is each token.
-    assert attn_metadata.block_tables.shape[0] == len(input_tokens)
-    # Block table's second dim corresponds to each token's block number.
-    # It is padded up to
-    assert attn_metadata.block_tables.shape[1] == (
-        model_runner.get_max_block_per_batch())
-    assert attn_metadata.use_cuda_graph is True
-
-    assert len(input_tokens) == expected_bs
-    assert len(input_positions) == expected_bs
-    if use_prompt_embeds:
-        expected_input_embeds_length = start_loc[-1]
-        assert len(input_embeds) == expected_input_embeds_length
-        assert expected_input_embeds_length <= expected_bs
-    else:
-        assert input_embeds is None
-
-    # Verify Sampling
-    expected_selected_token_indices = []
-    for selected_token_start_idx, _ in enumerate(context_lens):
-        expected_selected_token_indices.append(selected_token_start_idx)
-    sampling_metadata = SamplingMetadata.prepare(
-        seq_group_metadata_list,
-        seq_lens,
-        # query lens is all 1 for decode.
-        query_lens=[1 for _ in range(len(context_lens))],
-        device=model_runner.device,
-        pin_memory=model_runner.pin_memory)
-    actual = sampling_metadata.selected_token_indices
-    expected = torch.tensor(expected_selected_token_indices,
-                            device=actual.device,
-                            dtype=actual.dtype)
-    torch.testing.assert_close(actual, expected)
-
-
-def test_empty_seq_group():
-    """Verify prepare prompt and decode returns empty output."""
-    model_runner = _create_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        enforce_eager=False,
-    )
-    seq_group_metadata_list: list[SequenceGroupMetadata] = []
-    model_input = model_runner._prepare_model_input_tensors(
-        seq_group_metadata_list)
-
-    input_tokens = model_input.input_tokens
-    input_positions = model_input.input_positions
-    attn_metadata = model_input.attn_metadata
-
-    assert input_tokens is None
-    assert input_positions is None
-    assert attn_metadata is None
-
-    model_input = model_runner._prepare_model_input_tensors(
-        seq_group_metadata_list)
-
-    input_tokens = model_input.input_tokens
-    input_positions = model_input.input_positions
-    input_embeds = model_input.inputs_embeds
-    attn_metadata = model_input.attn_metadata
-    return_seq_lens = model_input.seq_lens
-
-    assert input_tokens is None
-    assert input_positions is None
-    assert input_embeds is None
-    assert attn_metadata is None
-    assert return_seq_lens is None
-
-
-@pytest.fixture
-def distributed_init():
-    init_distributed_environment(
-        world_size=1,
-        rank=0,
-        distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}",
-        local_rank=0)
-    ensure_model_parallel_initialized(1, 1)
-
-
-@pytest.mark.parametrize("batch_size", list(range(2, 128, 3)))
-@pytest.mark.parametrize("enforce_eager", [True, False])
-@pytest.mark.parametrize('use_prompt_embeds', [True, False])
-def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds,
-                        distributed_init, monkeypatch):
-    if use_prompt_embeds:
-        # Prompt Embeddings is only currently supported on V0
-        monkeypatch.setenv("VLLM_USE_V1", "0")
-
-    model_runner = _create_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        enforce_eager=enforce_eager,
-        max_num_batched_tokens=100000,
-        max_num_seqs=100000,
-        enable_chunked_prefill=True,
-        enable_prompt_embeds=True,
-    )
-
-    # Add prefill requests.
-    seq_lens: list[int] = []
-    seq_group_metadata_list: list[SequenceGroupMetadata] = []
-    prefill_metadata_list: list[SequenceGroupMetadata] = []
-    decode_metadata_list: list[SequenceGroupMetadata] = []
-    block_tables = {0: [1]}
-    prefill_batch_size = batch_size // 2
-    decode_batch_size = batch_size - prefill_batch_size
-    expected_input_embeds_len = 0
-    for i in range(prefill_batch_size):
-        # make sure all tokens fit into one block
-        seq_len = i % (model_runner.block_size - 1) + 1
-        seq_lens.append(seq_len)
-        if use_prompt_embeds:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=[0] * seq_len,
-                prompt_embeds=torch.rand(seq_len, 10),
-            )
-            expected_input_embeds_len += seq_len
-        else:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=range(seq_len), )
-        seq_group_metadata = SequenceGroupMetadata(
-            request_id=f"test_{i}",
-            is_prompt=True,
-            seq_data={0: seq_data},
-            sampling_params=SamplingParams(temperature=0),
-            block_tables=block_tables,
-        )
-        assert seq_group_metadata.token_chunk_size == seq_data.get_len()
-        seq_group_metadata_list.append(seq_group_metadata)
-        prefill_metadata_list.append(seq_group_metadata)
-
-    # Add decode requests
-    for i in range(prefill_batch_size, batch_size):
-        # make sure all tokens fit into one block
-        context_len = i % (model_runner.block_size - 1) + 1
-        if use_prompt_embeds:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=[0] * context_len,
-                prompt_embeds=torch.rand(context_len, 10),
-            )
-            output_embed = torch.rand(10)
-            # This also iterates the expected input_embeds, because the model
-            # needs both the input and output embeddings passed into together
-            expected_input_embeds_len += 1
-        else:
-            seq_data = SequenceData.from_seqs(
-                prompt_token_ids=range(context_len), )
-            output_embed = None
-        assert len(seq_data.prompt_token_ids) == context_len
-        seq_data.append_token_id(1, 0, output_embed)
-        seq_data.update_num_computed_tokens(context_len)
-        seq_group_metadata = SequenceGroupMetadata(
-            request_id=f"test_{i}",
-            is_prompt=False,
-            seq_data={0: seq_data},
-            sampling_params=SamplingParams(temperature=0),
-            block_tables={0: [1]},
-        )
-        assert seq_group_metadata.token_chunk_size == 1
-        seq_group_metadata_list.append(seq_group_metadata)
-        decode_metadata_list.append(seq_group_metadata)
-
-    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
-
-    input_tokens = model_input.input_tokens
-    input_positions = model_input.input_positions
-    input_embeds = model_input.inputs_embeds
-    attn_metadata = model_input.attn_metadata
-
-    prefill_meta_actual = attn_metadata.prefill_metadata
-    decode_meta_actual = attn_metadata.decode_metadata
-
-    assert len(attn_metadata.slot_mapping) == len(input_tokens)
-    assert len(input_positions) == len(input_tokens)
-    assert attn_metadata.num_prefills == prefill_batch_size
-    assert attn_metadata.num_decode_tokens == decode_batch_size
-    assert attn_metadata.num_prefill_tokens == sum(seq_lens)
-    if expected_input_embeds_len == 0:
-        assert input_embeds is None
-    else:
-        assert len(input_embeds) == expected_input_embeds_len
-
-    # Verify attn metadata is consistent. We don't need to test individual
-    # values here because they are tested above.
-    attn_metadata = model_runner._prepare_model_input_tensors(
-        seq_group_metadata_list).attn_metadata
-
-    for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata),
-                                          vars(prefill_meta_actual)):
-        assert attr_expected[1] == attr_actual[1]
-    for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata),
-                                          vars(decode_meta_actual)):
-        assert attr_expected[1] == attr_actual[1]
diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py
deleted file mode 100644
index d8767f700b576..0000000000000
--- a/tests/worker/test_profile.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.worker.cache_engine import CacheEngine
-from vllm.worker.worker import Worker
-
-
-def test_gpu_memory_profiling():
-    # Tests the gpu profiling that happens in order to determine the number of
-    # KV cache blocks that we can allocate on the GPU.
-    # This test mocks the maximum available gpu memory so that it can run on
-    # any gpu setup.
-
-    # Set up engine args to build a worker.
-    engine_args = EngineArgs(model="facebook/opt-125m",
-                             dtype="half",
-                             load_format="dummy")
-    engine_config = engine_args.create_engine_config()
-    engine_config.cache_config.num_gpu_blocks = 1000
-    engine_config.cache_config.num_cpu_blocks = 1000
-
-    # Create the worker.
-    distributed_init_method = get_distributed_init_method(
-        get_ip(), get_open_port())
-    worker = Worker(
-        vllm_config=engine_config,
-        local_rank=0,
-        rank=0,
-        distributed_init_method=distributed_init_method,
-        is_driver_worker=True,
-    )
-
-    # Set 10GiB as the total gpu ram to be device-agnostic
-    def mock_mem_info():
-        current_usage = torch.cuda.memory_stats(
-        )["allocated_bytes.all.current"]
-        mock_total_bytes = 10 * 1024**3
-        free = mock_total_bytes - current_usage
-
-        return (free, mock_total_bytes)
-
-    from unittest.mock import patch
-    with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info):
-        # Load the model so we can profile it
-        worker.init_device()
-        worker.load_model()
-        gpu_blocks, _ = worker.determine_num_available_blocks()
-
-    # Peak vram usage by torch should be 0.47 GiB
-    # Model weights take 0.25 GiB
-    # No memory should be allocated outside of torch
-    # 9.0 GiB should be the utilization target
-    # 8.28 GiB should be available for the KV cache
-    block_size = CacheEngine.get_cache_block_size(
-        engine_config.cache_config, engine_config.model_config,
-        engine_config.parallel_config)
-
-    expected_blocks = (8.28 * 1024**3) // block_size
-
-    # Check within a small tolerance for portability
-    # Hardware, kernel, or dependency changes could all affect memory
-    # utilization.
-    # A 100 block tolerance here should be about 60MB of wiggle room.
-    assert abs(gpu_blocks - expected_blocks) < 100
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
deleted file mode 100644
index 6d9f404ac207b..0000000000000
--- a/tests/worker/test_swap.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-from vllm.engine.arg_utils import EngineArgs
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
-from vllm.worker.worker import Worker
-
-
-def test_swap() -> None:
-    # Configure the engine.
-    engine_args = EngineArgs(model="distilbert/distilgpt2",
-                             dtype="half",
-                             load_format="dummy")
-    engine_config = engine_args.create_engine_config()
-    engine_config.cache_config.num_gpu_blocks = 1000
-    engine_config.cache_config.num_cpu_blocks = 1000
-
-    # Create the worker.
-    distributed_init_method = get_distributed_init_method(
-        get_ip(), get_open_port())
-    worker = Worker(
-        vllm_config=engine_config,
-        local_rank=0,
-        rank=0,
-        distributed_init_method=distributed_init_method,
-        is_driver_worker=True,
-    )
-
-    # Initialize the worker.
-    worker.init_device()
-    worker.load_model()
-    worker.initialize_cache(
-        num_gpu_blocks=engine_config.cache_config.num_gpu_blocks,
-        num_cpu_blocks=engine_config.cache_config.num_cpu_blocks)
-
-    # Randomly initialize the cache.
-    gpu_cache = worker.cache_engine[0].gpu_cache
-    cpu_cache = worker.cache_engine[0].cpu_cache
-    num_layers = len(gpu_cache)
-    for i in range(num_layers):
-        gpu_key_cache, gpu_value_cache = gpu_cache[i]
-        gpu_key_cache.random_()
-        gpu_value_cache.random_()
-        cpu_key_cache, cpu_value_cache = cpu_cache[i]
-        cpu_key_cache.random_()
-        cpu_value_cache.random_()
-
-    allclose = lambda a, b: torch.allclose(
-        a.cuda(), b.cuda(), rtol=0.0, atol=0.0)
-
-    # Test swap out.
-    blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)]
-    execute_model_req = ExecuteModelRequest(
-        seq_group_metadata_list=[],
-        blocks_to_swap_in=[],
-        blocks_to_swap_out=blocks_to_swap_out,
-        blocks_to_copy=[],
-    )
-    worker.execute_model(execute_model_req=execute_model_req)
-
-    for i in range(num_layers):
-        gpu_key_cache, gpu_value_cache = gpu_cache[i]
-        cpu_key_cache, cpu_value_cache = cpu_cache[i]
-        for src, dst in blocks_to_swap_out:
-            assert allclose(gpu_key_cache[src], cpu_key_cache[dst])
-            assert allclose(gpu_value_cache[src], cpu_value_cache[dst])
-
-    # Test swap in.
-    execute_model_req.blocks_to_swap_out = []
-    execute_model_req.blocks_to_swap_in = [
-        (19, 45),
-        (67, 23),
-        (12, 78),
-        (40, 99),
-        (1, 71),
-    ]
-    worker.execute_model(execute_model_req=execute_model_req)
-
-    for i in range(num_layers):
-        gpu_key_cache, gpu_value_cache = gpu_cache[i]
-        cpu_key_cache, cpu_value_cache = cpu_cache[i]
-        for src, dst in execute_model_req.blocks_to_swap_in:
-            assert allclose(gpu_key_cache[dst], cpu_key_cache[src])
-            assert allclose(gpu_value_cache[dst], cpu_value_cache[src])

From b7433ca1a47732394b1bdea4099d98389515954b Mon Sep 17 00:00:00 2001
From: Benjamin Chislett <bchislett@nvidia.com>
Date: Thu, 18 Sep 2025 01:07:24 -0400
Subject: [PATCH 11/58] [Spec Decode] Efficient padded speculation (#24539)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
---
 tests/v1/spec_decode/test_eagle.py | 179 +++++++++++++++++++-
 vllm/config/speculative.py         |   5 +
 vllm/v1/spec_decode/eagle.py       | 258 +++++++++++++++++++++++++----
 vllm/v1/worker/gpu_input_batch.py  |   5 +-
 vllm/v1/worker/gpu_model_runner.py | 164 +++++++++++-------
 5 files changed, 507 insertions(+), 104 deletions(-)

diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index ccab04628a163..e7f6b68fc3f77 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -19,6 +19,8 @@ from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
 from vllm.v1.spec_decode.eagle import EagleProposer
+from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 
 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
 eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
@@ -64,6 +66,86 @@ def _create_proposer(
                          device=current_platform.device_type)
 
 
+def test_prepare_next_token_ids():
+    """
+    Test for prepare_next_token_ids_cpu and prepare_next_token_ids_padded.
+    Each will produce a device tensor of next_token_ids, taking as input
+    either the GPU tensor of sampled_token_ids with -1 for rejected tokens,
+    or the CPU python list[list[int]] with the rejected tokens removed.
+    """
+    device = torch.device(current_platform.device_type)
+
+    num_requests = 4
+    num_speculative_tokens = 4
+    batch_spec = BatchSpec(
+        seq_lens=[num_speculative_tokens + 1] * num_requests,
+        query_lens=[num_speculative_tokens + 1] * num_requests,
+    )
+
+    req_ids = [f"req_{i+1}" for i in range(num_requests)]
+    mock_input_batch = mock.MagicMock(spec=InputBatch)
+    mock_input_batch.req_ids = req_ids
+    mock_input_batch.num_reqs = num_requests
+    mock_input_batch.vocab_size = 100
+
+    mock_num_scheduled_tokens = {req_id: 0 for req_id in req_ids}
+    mock_requests = {}
+    for req_id in req_ids:
+        mock_request = mock.MagicMock(spec=CachedRequestState)
+        # Each request will have a backup next token id of 10, 20, 30, 40
+        mock_request.get_token_id.return_value = int(req_id.split("_")[1]) * 10
+        mock_request.num_computed_tokens = 0
+        mock_requests[req_id] = mock_request
+
+    sampled_token_ids = [
+        [0, 1, -1, -1, -1],  # 1 accepted, 3 rejected, "1" sampled
+        [0, 1, 2, 3, 4],  # all accepted, "4" sampled
+        [-1, -1, -1, -1, -1],  # sampling skipped, use backup token "30"
+        [-1, -1, -1, -1, -1]  # this request will be discarded
+    ]
+    sampled_token_ids_tensor = torch.tensor(sampled_token_ids,
+                                            dtype=torch.int32,
+                                            device=device)
+    sampled_token_ids_cpu = [[i for i in seq if i != -1]
+                             for seq in sampled_token_ids]
+
+    expected_next_token_ids_cpu = [1, 4, 30, 40]
+    expected_next_token_ids_tensor = torch.tensor(expected_next_token_ids_cpu,
+                                                  dtype=torch.int32,
+                                                  device=device)
+
+    proposer = _create_proposer("eagle", num_speculative_tokens)
+
+    next_token_ids_from_cpu = proposer.prepare_next_token_ids_cpu(
+        sampled_token_ids_cpu, mock_requests, mock_input_batch,
+        mock_num_scheduled_tokens)
+
+    assert torch.equal(next_token_ids_from_cpu, expected_next_token_ids_tensor)
+
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec,
+        block_size=16,
+        device=device,
+    )
+
+    discarded_req_indices = torch.tensor([3], dtype=torch.int64, device=device)
+    num_discarded_reqs = 1
+
+    expected_valid_sampled_tokens_count = torch.tensor([2, 5, 0, 0],
+                                                       dtype=torch.int32,
+                                                       device=device)
+
+    next_token_ids_from_padded, valid_sampled_tokens_count = \
+        proposer.prepare_next_token_ids_padded(
+            common_attn_metadata, sampled_token_ids_tensor, mock_requests,
+            mock_input_batch, discarded_req_indices, num_discarded_reqs)
+
+    assert torch.equal(next_token_ids_from_padded,
+                       expected_next_token_ids_tensor)
+    assert torch.equal(valid_sampled_tokens_count,
+                       expected_valid_sampled_tokens_count)
+
+
 def test_prepare_inputs():
     """
     cu_target_query_lens: [0, a, a + b, a + b + c]
@@ -90,10 +172,24 @@ def test_prepare_inputs():
         device=device,
     )
 
-    # Rejected tokens per request: [1, 3, 2]
-    num_rejected_tokens = torch.tensor([1, 3, 2],
-                                       dtype=torch.int32,
-                                       device=device)
+    # If there are `k` sampled tokens, then `k-1` tokens are draft tokens
+    # from the previous iteration, and the last token is the bonus token sampled
+    # from the base model.
+    num_draft_tokens = [3, 6, 4]  # one less than query_lens
+    # num rejected tokens is [1, 3, 2]
+    ACCEPT_TOKEN = 0
+    BONUS_TOKEN = 1
+    REJECT_TOKEN = -1
+    sampled_token_ids = [
+        [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, BONUS_TOKEN],
+        [
+            ACCEPT_TOKEN, ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN,
+            REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN
+        ],
+        [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN]
+    ]
+    sampled_token_ids = [[i for i in seq if i != REJECT_TOKEN]
+                         for seq in sampled_token_ids]
 
     # Expected calculations:
     # query_len_per_req = [4, 7, 5]
@@ -125,7 +221,7 @@ def test_prepare_inputs():
     proposer = _create_proposer("eagle", 1)
 
     updated_metadata, token_indices = proposer.prepare_inputs(
-        common_attn_metadata, num_rejected_tokens.cpu())
+        common_attn_metadata, sampled_token_ids, num_draft_tokens)
 
     assert torch.equal(updated_metadata.query_start_loc,
                        expected_cu_num_tokens)
@@ -133,6 +229,77 @@ def test_prepare_inputs():
     assert torch.equal(token_indices, expected_token_indices)
 
 
+def test_prepare_inputs_padded():
+    """
+    Input scenario is 3 requests with num_speculative_tokens == 2 and:
+    - Request 1: query_len = 3, rejected = 1
+    - Request 2: query_len = 3, rejected = 0
+    - Request 3: query_len = 3, rejected = 2
+
+    Expected outputs:
+    token_indices: [0, 1, 2,
+                    3, 4, 5,
+                    6, 7, 8]
+    Reason: Deferred computation should not disturb the original indices.
+
+    token_indices_to_sample: [1, 5, 6]
+    Reason: After accounting for rejections, these are the valid token positions
+            from the original indices to sample from.
+    """
+
+    device = torch.device(current_platform.device_type)
+
+    expected_token_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8],
+                                          dtype=torch.int32,
+                                          device=device)
+    expected_token_indices_to_sample = torch.tensor([1, 5, 6],
+                                                    dtype=torch.int32,
+                                                    device=device)
+
+    num_speculative_tokens = 2
+    batch_spec = BatchSpec(
+        seq_lens=[3, 3, 3],
+        query_lens=[3, 3, 3],
+    )
+
+    common_attn_metadata = create_common_attn_metadata(
+        batch_spec,
+        block_size=16,
+        device=device,
+    )
+
+    # Needed for cu_num_draft_tokens, which is expected to be [3, 6, 9]
+    expected_query_start_loc = torch.tensor([0, 3, 6, 9],
+                                            dtype=torch.int32,
+                                            device=device)
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
+        draft_token_ids=[[0] * num_speculative_tokens] * 3,
+        device=device,
+    )
+
+    # num_rejected_tokens = [1, 0, 2]
+    # num_draft_tokens = [2, 2, 2]
+    # valid_sampled_tokens_count = num_draft_tokens + 1 - num_rejected_tokens
+    valid_sampled_tokens_count = torch.tensor([2, 3, 1],
+                                              dtype=torch.int32,
+                                              device=device)
+
+    proposer = _create_proposer("eagle", num_speculative_tokens)
+
+    output_metadata, token_indices, token_indices_to_sample = \
+        proposer.prepare_inputs_padded(
+            common_attn_metadata,
+            spec_decode_metadata,
+            valid_sampled_tokens_count)
+
+    assert output_metadata.max_query_len == 3
+    assert torch.equal(output_metadata.query_start_loc,
+                       expected_query_start_loc)
+    assert torch.equal(token_indices, expected_token_indices)
+    assert torch.equal(token_indices_to_sample,
+                       expected_token_indices_to_sample)
+
+
 @pytest.mark.parametrize("method", ["eagle", "eagle3"])
 @pytest.mark.parametrize("attn_backend",
                          get_attn_backend_list_based_on_platform())
@@ -373,6 +540,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
                               target_positions=target_positions,
                               target_hidden_states=target_hidden_states,
                               next_token_ids=next_token_ids,
+                              last_token_indices=None,
                               common_attn_metadata=common_attn_metadata,
                               sampling_metadata=sampling_metadata)
 
@@ -526,6 +694,7 @@ def test_propose_tree(spec_token_tree):
                               target_positions=target_positions,
                               target_hidden_states=target_hidden_states,
                               next_token_ids=next_token_ids,
+                              last_token_indices=None,
                               common_attn_metadata=common_attn_metadata,
                               sampling_metadata=sampling_metadata)
     assert result.shape == (batch_size, num_speculative_tokens)
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index b2d50e3852337..fca8c28e5c61e 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -83,6 +83,11 @@ class SpeculativeConfig:
     disable_by_batch_size: Optional[int] = None
     """Disable speculative decoding for new incoming requests when the number
     of enqueued requests is larger than this value, if provided."""
+    disable_padded_drafter_batch: bool = False
+    """Disable input padding for speculative decoding. If set to True,
+    speculative input batches can contain sequences of different lengths,
+    which may only be supported by certain attention backends. This currently
+    only affects the EAGLE method of speculation."""
 
     # Ngram proposer configuration
     prompt_lookup_max: Optional[int] = None
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 5154b29405b6e..2a178ddf48777 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -27,6 +27,9 @@ from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
+from vllm.v1.utils import CpuGpuBuffer
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.ubatching import dbo_current_ubatch_id
 
 logger = init_logger(__name__)
@@ -94,20 +97,26 @@ class EagleProposer:
             dtype=self.dtype,
             device=device)
 
+        # The arange must cover query_start_loc, which has batch_size + 1
+        # entries (hence the +1), and is sized to at least max_num_tokens.
         max_batch_size = vllm_config.scheduler_config.max_num_seqs
-        self.arange = torch.arange(
-            # We need +1 here because the arange is used to set query_start_loc,
-            # which has one more element than batch_size.
-            max_batch_size + 1,
-            device=device,
-            dtype=torch.int32,
-        )
+        max_num_slots_for_arange = max(max_batch_size + 1, self.max_num_tokens)
+        self.arange = torch.arange(max_num_slots_for_arange,
+                                   device=device,
+                                   dtype=torch.int32)
 
         self.inputs_embeds = torch.zeros(
             (self.max_num_tokens, self.hidden_size),
             dtype=self.dtype,
             device=device)
 
+        self.backup_next_token_ids = CpuGpuBuffer(
+            max_batch_size,
+            dtype=torch.int32,
+            pin_memory=is_pin_memory_available(),
+            device=device,
+            with_numpy=True)
+
         # Determine allowed attention backends once during initialization.
         self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...]
         if current_platform.is_rocm():
@@ -156,13 +165,16 @@ class EagleProposer:
         target_hidden_states: torch.Tensor,
         # [batch_size]
         next_token_ids: torch.Tensor,
+        last_token_indices: Optional[torch.Tensor],
         common_attn_metadata: CommonAttentionMetadata,
         sampling_metadata: SamplingMetadata,
         mm_embeds: Optional[list[torch.Tensor]] = None,
     ) -> torch.Tensor:
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
-        last_token_indices = common_attn_metadata.query_start_loc[1:] - 1
+
+        if last_token_indices is None:
+            last_token_indices = common_attn_metadata.query_start_loc[1:] - 1
 
         if self.method == "eagle3":
             assert isinstance(self.model, Eagle3LlamaForCausalLM)
@@ -228,6 +240,12 @@ class EagleProposer:
                 last_hidden_states, hidden_states = ret_hidden_states
         sample_hidden_states = last_hidden_states[last_token_indices]
         logits = self.model.compute_logits(sample_hidden_states, None)
+
+        # Early exit if there is only one draft token to be generated.
+        if self.num_speculative_tokens == 1:
+            draft_token_ids = logits.argmax(dim=-1)
+            return draft_token_ids.view(-1, 1)
+
         positions = target_positions[last_token_indices]
         hidden_states = hidden_states[last_token_indices]
 
@@ -245,15 +263,12 @@ class EagleProposer:
 
         draft_token_ids = logits.argmax(dim=-1)
 
-        # Early exit if there is only one draft token to be generated.
-        if self.num_speculative_tokens == 1:
-            # [batch_size, 1]
-            return draft_token_ids.view(-1, 1)
-
-        # TODO: Currently, MTP module released by deepseek only has
-        # one layer. Adapt this code to support multiple layers once
-        # there's a multi-layer MTP module.
-        assert isinstance(attn_metadata, self.allowed_attn_types)
+        if not isinstance(attn_metadata, self.allowed_attn_types):
+            raise ValueError(
+                f"Unsupported attention metadata type for speculative "
+                "decoding with num_speculative_tokens > 1: "
+                f"{type(attn_metadata)}. Supported types are: "
+                f"{self.allowed_attn_types}")
 
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]
@@ -263,10 +278,13 @@ class EagleProposer:
             input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size)
         else:
             input_batch_size = batch_size
-        attn_metadata.num_actual_tokens = batch_size
-        attn_metadata.max_query_len = 1
-        attn_metadata.query_start_loc = self.arange[:batch_size + 1]
-        for _ in range(self.num_speculative_tokens - 1):
+
+        common_attn_metadata.num_actual_tokens = batch_size
+        common_attn_metadata.max_query_len = 1
+        common_attn_metadata.query_start_loc = self.arange[:batch_size + 1]
+        common_attn_metadata.query_start_loc_cpu = torch.from_numpy(
+            self.token_arange_np[:batch_size + 1]).clone()
+        for token_index in range(self.num_speculative_tokens - 1):
             # Update the inputs.
             # cast to int32 is crucial when eagle model is compiled.
             # tensor.argmax() returns int64 by default.
@@ -286,27 +304,38 @@ class EagleProposer:
                                             positions)
 
             # Increment the sequence lengths.
-            attn_metadata.max_seq_len += 1
-            attn_metadata.seq_lens += 1
-            # Consider max model length.
-            attn_metadata.max_seq_len = min(attn_metadata.max_seq_len,
-                                            self.max_model_len)
+            common_attn_metadata.seq_lens += 1
+            common_attn_metadata.seq_lens_cpu += 1
             # For the requests that exceed the max model length, we set the
             # sequence length to 1 to minimize their overheads in attention.
-            attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
+            common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len,
+                                                       1)
+
+            common_attn_metadata.num_computed_tokens_cpu = \
+                common_attn_metadata.seq_lens_cpu - 1
 
             # Compute the slot mapping.
             block_numbers = clamped_positions // self.block_size
-            block_ids = attn_metadata.block_table.gather(
+            block_ids = common_attn_metadata.block_table_tensor.gather(
                 dim=1, index=block_numbers.view(-1, 1))
             block_ids = block_ids.view(-1)
-            attn_metadata.slot_mapping = (block_ids * self.block_size +
-                                          clamped_positions % self.block_size)
+            common_attn_metadata.slot_mapping = (
+                block_ids * self.block_size +
+                clamped_positions % self.block_size)
             # Mask out the slot mappings that exceed the max model length.
             # Otherwise, the KV cache will be inadvertently updated with the
             # padding tokens.
-            attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len,
-                                                    PADDING_SLOT_ID)
+            common_attn_metadata.slot_mapping.masked_fill_(
+                exceeds_max_model_len, PADDING_SLOT_ID)
+
+            # Rebuild attention metadata
+            attn_metadata_builder = \
+                self.runner.attn_groups[0][0].metadata_builders[ubatch_id]
+            attn_metadata = attn_metadata_builder\
+                .build_for_drafting(common_attn_metadata=common_attn_metadata,
+                                draft_index=token_index + 1)
+            for layer_name in self.attn_layer_names:
+                per_layer_attn_metadata[layer_name] = attn_metadata
 
             # copy inputs to buffer for cudagraph
             self.input_ids[:batch_size] = input_ids
@@ -347,6 +376,158 @@ class EagleProposer:
         draft_token_ids = torch.stack(draft_token_ids_list, dim=1)
         return draft_token_ids
 
+    def prepare_next_token_ids_cpu(
+            self, sampled_token_ids: list[list[int]],
+            requests: dict[str,
+                           CachedRequestState], gpu_input_batch: InputBatch,
+            num_scheduled_tokens: dict[str, int]) -> torch.Tensor:
+        """
+        Prepare the next token id of each request for speculative decoding
+        from the CPU-side lists of sampled token ids: the last sampled token
+        is used when the list is non-empty. If a request has no sampled token
+        ids (e.g., a partial prefill), the id is read from the request state.
+        Returns an int32 tensor placed on the device of `self.input_ids`.
+        """
+        req_ids = gpu_input_batch.req_ids
+        next_token_ids: list[int] = []
+        for i, token_ids in enumerate(sampled_token_ids):
+            if token_ids:
+                # Common case: take the last sampled token.
+                next_token_id = token_ids[-1]
+            else:
+                # Partial prefill (rare case): nothing was sampled, so look up
+                # the token at index (computed + scheduled) in the request.
+                req_id = req_ids[i]
+                req_state = requests[req_id]
+                seq_len = (req_state.num_computed_tokens +
+                           num_scheduled_tokens[req_id])
+                next_token_id = req_state.get_token_id(seq_len)
+            next_token_ids.append(next_token_id)
+        next_token_ids = torch.tensor(next_token_ids,
+                                      dtype=torch.int32,
+                                      device=self.input_ids.device)
+        return next_token_ids
+
+    def prepare_next_token_ids_padded(self,
+                               common_attn_metadata: CommonAttentionMetadata,
+                               sampled_token_ids: torch.Tensor,
+                               requests: dict[str, CachedRequestState],
+                               gpu_input_batch: InputBatch,
+                               discard_request_indices: torch.Tensor,
+                               num_discarded_requests: int) -> \
+                                tuple[torch.Tensor, torch.Tensor]:
+        """
+        This function is used to prepare the inputs for speculative decoding.
+        It calculates the next token ids and the number of valid sampled tokens
+        for each request, considering the "discarded" requests whose next token
+        is not sampled and comes from `request.get_token_id()` instead.
+        It also accounts for the rejected tokens in `sampled_token_ids`.
+        This function must use device functions to operate on the inputs, and
+        should not introduce any blocking CPU-GPU synchronization.
+        """
+        # TODO(Ben): Combine this into a custom fused kernel
+
+        # Precompute get_token_id for when there is no valid next token
+        num_reqs = gpu_input_batch.num_reqs
+        self.backup_next_token_ids.np[:num_reqs] = np.array([
+            requests[gpu_input_batch.req_ids[i]].get_token_id(
+                common_attn_metadata.seq_lens_cpu[i].item())
+            for i in range(num_reqs)
+        ])
+        self.backup_next_token_ids.copy_to_gpu(num_reqs)
+
+        # Fill the rows of discarded requests with -1 so they count as invalid.
+        discard_sampled_tokens_req_indices = \
+            discard_request_indices[:num_discarded_requests]
+
+        valid_sampled_token_ids_gpu = sampled_token_ids.clone()
+        valid_sampled_token_ids_gpu.index_fill_(
+            0, discard_sampled_tokens_req_indices, -1)
+
+        # Generate a mask for all valid tokens within those requests.
+        # NOTE: Do not special-case a single sampled token per request
+        # (shape[-1] == 1) as all-valid: the rows of discarded requests
+        # were filled with -1 above, and an all-ones mask would return
+        # that -1 as the next token id instead of the backup token id.
+        # Applying the mask uniformly keeps both cases consistent.
+        valid_mask = (
+            (valid_sampled_token_ids_gpu != -1) &
+            (valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size))
+
+        # Count the number of valid tokens in each request
+        valid_sampled_tokens_count = valid_mask.sum(dim=1)
+
+        # Get the rightmost valid index per row
+        last_valid_indices = valid_sampled_tokens_count - 1
+        last_valid_indices_safe = torch.clamp(last_valid_indices, min=0)
+
+        # Get last valid token from each row
+        # (assume undefined state where there is no valid token)
+        selected_tokens = torch.gather(
+            valid_sampled_token_ids_gpu, 1,
+            last_valid_indices_safe.unsqueeze(1)).squeeze(1)
+
+        # Use last token if valid, pre-computed backup if not
+        batch_size = valid_sampled_token_ids_gpu.shape[0]
+        next_token_ids = torch.where(
+            last_valid_indices != -1, selected_tokens,
+            self.backup_next_token_ids.gpu[:batch_size])
+
+        return next_token_ids, valid_sampled_tokens_count
+
+    def prepare_inputs_padded(self,
+                                common_attn_metadata: CommonAttentionMetadata,
+                                spec_decode_metadata: SpecDecodeMetadata,
+                                valid_sampled_tokens_count: torch.Tensor) -> \
+                    tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]:
+        """
+        This function is used to prepare the inputs for speculative decoding.
+        It builds a new CommonAttentionMetadata from `common_attn_metadata`
+        without removing the rejected tokens: all tokens are kept as inputs
+        to the speculator, with the rejected tokens acting as padding that
+        is filtered out later by the returned `token_indices_to_sample`.
+        No blocking CPU operations should be introduced in this function.
+        """
+        num_draft_tokens_gpu = torch.cat([
+            spec_decode_metadata.cu_num_draft_tokens[0:1],
+            spec_decode_metadata.cu_num_draft_tokens[1:] -
+            spec_decode_metadata.cu_num_draft_tokens[:-1]
+        ])
+        # Rejected tokens per request; zero for requests that drafted nothing.
+        num_rejected_tokens_gpu = torch.where(
+            num_draft_tokens_gpu > 0,
+            num_draft_tokens_gpu + 1 - valid_sampled_tokens_count,
+            torch.zeros_like(num_draft_tokens_gpu))
+
+        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+
+        new_query_len_per_req = (query_start_loc_cpu[1:] -
+                                 query_start_loc_cpu[:-1])
+
+        total_num_tokens = query_start_loc_cpu[-1].item()
+        token_indices = self.arange[:total_num_tokens]
+
+        spec_common_attn_metadata = CommonAttentionMetadata(
+            query_start_loc=common_attn_metadata.query_start_loc,
+            seq_lens=common_attn_metadata.seq_lens,
+            query_start_loc_cpu=query_start_loc_cpu,
+            seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
+            num_computed_tokens_cpu=common_attn_metadata.
+            num_computed_tokens_cpu,
+            num_reqs=common_attn_metadata.num_reqs,
+            num_actual_tokens=total_num_tokens,
+            max_query_len=new_query_len_per_req.max().item(),
+            max_seq_len=common_attn_metadata.seq_lens_cpu.max().item(),
+            block_table_tensor=common_attn_metadata.block_table_tensor,
+            slot_mapping=common_attn_metadata.slot_mapping[token_indices],
+            causal=True,
+        )
+        # Last query token of each request, moved back by its rejected count.
+        token_indices_to_sample = common_attn_metadata.query_start_loc[1:] - 1 \
+            - num_rejected_tokens_gpu
+
+        return spec_common_attn_metadata, token_indices, token_indices_to_sample
+
     def propose_tree(
         self,
         batch_size: int,
@@ -520,11 +701,11 @@ class EagleProposer:
     def prepare_inputs(
         self,
         common_attn_metadata: CommonAttentionMetadata,
-        # [batch_size]
-        num_rejected_tokens: torch.Tensor
+        sampled_token_ids: list[list[int]],
+        num_draft_tokens: list[int],
     ) -> tuple[CommonAttentionMetadata, torch.Tensor]:
         """
-        This function is used to prepare the inputs for the spec decode.
+        This function is used to prepare the inputs for speculative decoding.
         It updates to the common_attn_metadata to account for the rejected
         tokens (and newly sampled tokens). It also returns the token indices
         of the tokens that should be fed to the speculator.
@@ -545,6 +726,13 @@ class EagleProposer:
         #                 q1, q1 + 1, ..., q1 + q2 - n2 - 1,
         #                 q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1]
 
+        num_rejected_tokens = [
+            n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
+            for i, n in enumerate(num_draft_tokens)
+        ]
+        num_rejected_tokens = torch.tensor(num_rejected_tokens,
+                                           dtype=torch.int32)
+
         device = common_attn_metadata.query_start_loc.device
         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
         new_seq_lens_cpu = common_attn_metadata.seq_lens_cpu \
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 339b9937b73f4..6717622efb801 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -64,7 +64,10 @@ class CachedRequestState:
     def get_token_id(self, idx: int) -> int:
         if idx < self.num_prompt_tokens:
             return self.prompt_token_ids[idx]
-        return self.output_token_ids[idx - self.num_prompt_tokens]
+        elif idx - self.num_prompt_tokens < len(self.output_token_ids):
+            return self.output_token_ids[idx - self.num_prompt_tokens]
+        else:
+            return -1
 
 
 class InputBatch:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f256dc160a6b5..e8ad9c2fca07c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -344,6 +344,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                                self.hidden_size,
                                                dtype=self.dtype,
                                                numpy=False)
+        self.discard_request_indices = self._make_buffer(self.max_num_reqs,
+                                                         dtype=torch.int64)
+        self.num_discarded_requests = 0
+
         self.num_draft_tokens = self._make_buffer(self.max_num_reqs,
                                                   dtype=torch.int32)
         self.num_accepted_tokens = self._make_buffer(self.max_num_reqs,
@@ -974,6 +978,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         seq_lens = self.seq_lens.gpu[:num_reqs]
         max_seq_len = self.seq_lens.np[:num_reqs].max().item()
 
+        num_tokens = [
+            self.requests[r].num_tokens for r in self.input_batch.req_ids
+        ]
+        num_tokens_np = np.array(num_tokens, dtype=np.int32)
+
+        # Record the index of requests that should not be sampled,
+        # so that we could clear the sampled tokens before returning
+        discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np
+        discard_request_indices = np.nonzero(discard_requests_mask)[0]
+        self.num_discarded_requests = len(discard_request_indices)
+        self.discard_request_indices.np[:self.num_discarded_requests] = (
+            discard_request_indices)
+
+        self.discard_request_indices.copy_to_gpu(self.num_discarded_requests)
+
         # Copy the tensors to the GPU.
         self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
 
@@ -1973,23 +1992,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
             num_nans_in_logits = self._get_nans_in_logits(logits)
 
-        # TODO(woosuk): The following loop can be slow since it iterates over
-        # the requests one by one. Optimize.
-        discard_sampled_tokens_req_indices = []
-        for i, req_id in enumerate(self.input_batch.req_ids):
-            req_state = self.requests[req_id]
-            seq_len = (req_state.num_computed_tokens +
-                       scheduler_output.num_scheduled_tokens[req_id])
-            if seq_len < req_state.num_tokens:
-                # Ignore the sampled token for partial prefills.
-                # Rewind the generator state as if the token was not sampled.
-                # This relies on cuda-specific torch-internal impl details
-                generator = self.input_batch.generators.get(i)
-                if generator is not None:
-                    generator.set_offset(generator.get_offset() - 4)
-                # Record the index of the request that should not be sampled,
-                # so that we could clear the sampled tokens before returning.
-                discard_sampled_tokens_req_indices.append(i)
+        discard_sampled_tokens_req_indices = \
+            self.discard_request_indices.np[:self.num_discarded_requests]
+        for i in discard_sampled_tokens_req_indices:
+            gen = self.input_batch.generators.get(int(i))
+            if gen is not None:
+                gen.set_offset(gen.get_offset() - 4)
 
         # Copy some objects so they don't get modified after returning.
         # This is important when using async scheduling.
@@ -2026,10 +2034,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 )
             # Mask out the sampled tokens that should not be sampled.
             for i in discard_sampled_tokens_req_indices:
-                valid_sampled_token_ids[i].clear()
+                valid_sampled_token_ids[int(i)].clear()
         else:
             valid_sampled_token_ids = []
-            invalid_req_indices = list(discard_sampled_tokens_req_indices)
+            invalid_req_indices = discard_sampled_tokens_req_indices.tolist()
             invalid_req_indices_set = set(invalid_req_indices)
             assert sampled_token_ids.shape[-1] == 1
 
@@ -2229,6 +2237,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         with record_function_or_nullcontext("Sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)
 
+        def propose_draft_token_ids(sampled_token_ids):
+            assert spec_decode_common_attn_metadata is not None
+            with record_function_or_nullcontext("Draft"):
+                self._draft_token_ids = self.propose_draft_token_ids(
+                    scheduler_output,
+                    sampled_token_ids,
+                    self.input_batch.sampling_metadata,
+                    hidden_states,
+                    sample_hidden_states,
+                    aux_hidden_states,
+                    spec_decode_metadata,
+                    spec_decode_common_attn_metadata,
+                )
+
+        use_padded_batch_for_eagle = self.speculative_config and \
+            self.speculative_config.use_eagle() and \
+            not self.speculative_config.disable_padded_drafter_batch
+        if use_padded_batch_for_eagle:
+            # EAGLE speculative decoding can use the GPU sampled tokens
+            # as inputs, and does not need to wait for bookkeeping to finish.
+            propose_draft_token_ids(sampler_output.sampled_token_ids)
+
         with record_function_or_nullcontext("Bookkeep"):
             (
                 num_nans_in_logits,
@@ -2242,19 +2272,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                        logits, hidden_states,
                                        num_scheduled_tokens)
 
-        if self.speculative_config:
-            assert spec_decode_common_attn_metadata is not None
-            with record_function_or_nullcontext("Draft"):
-                self._draft_token_ids = self.propose_draft_token_ids(
-                    scheduler_output,
-                    valid_sampled_token_ids,
-                    self.input_batch.sampling_metadata,
-                    hidden_states,
-                    sample_hidden_states,
-                    aux_hidden_states,
-                    spec_decode_metadata,
-                    spec_decode_common_attn_metadata,
-                )
+        if self.speculative_config and not use_padded_batch_for_eagle:
+            # ngram and other speculative decoding methods use the sampled
+            # tokens on the CPU, so they are run after bookkeeping.
+            propose_draft_token_ids(valid_sampled_token_ids)
 
         with record_function_or_nullcontext("EPLB"):
             self.eplb_step()
@@ -2294,7 +2315,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def propose_draft_token_ids(
         self,
         scheduler_output: "SchedulerOutput",
-        sampled_token_ids: list[list[int]],
+        sampled_token_ids: Union[torch.Tensor, list[list[int]]],
         sampling_metadata: SamplingMetadata,
         hidden_states: torch.Tensor,
         sample_hidden_states: torch.Tensor,
@@ -2304,11 +2325,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     ) -> Union[list[list[int]], torch.Tensor]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if self.speculative_config.method == "ngram":
+            assert isinstance(sampled_token_ids, list)
             assert isinstance(self.drafter, NgramProposer)
             draft_token_ids = self.propose_ngram_draft_token_ids(
                 sampled_token_ids)
         elif self.speculative_config.method == "medusa":
+            assert isinstance(sampled_token_ids, list)
             assert isinstance(self.drafter, MedusaProposer)
+
             if sample_hidden_states.shape[0] == len(sampled_token_ids):
                 # The input to the target model does not include draft tokens.
                 hidden_states = sample_hidden_states
@@ -2329,27 +2353,37 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             )
         elif self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
-            # TODO(woosuk): Refactor the loop.
-            req_ids = self.input_batch.req_ids
-            next_token_ids: list[int] = []
-            for i, token_ids in enumerate(sampled_token_ids):
-                if token_ids:
-                    # Common case.
-                    next_token_id = token_ids[-1]
-                else:
-                    # Partial prefill (rare case).
-                    # Get the next token id from the request state.
-                    req_id = req_ids[i]
-                    req_state = self.requests[req_id]
-                    seq_len = (req_state.num_computed_tokens +
-                               scheduler_output.num_scheduled_tokens[req_id])
-                    next_token_id = req_state.get_token_id(seq_len)
-                next_token_ids.append(next_token_id)
-            next_token_ids = torch.tensor(next_token_ids,
-                                          dtype=torch.int32,
-                                          device=self.device)
+
+            if self.speculative_config.disable_padded_drafter_batch:
+                # When padded-batch is disabled, the sampled_token_ids should be
+                # the cpu-side list[list[int]] of valid sampled tokens for each
+                # request, with invalid requests having empty lists.
+                assert isinstance(sampled_token_ids, list), \
+                    "sampled_token_ids should be a python list when " \
+                    "padded-batch is disabled."
+                next_token_ids = self.drafter.prepare_next_token_ids_cpu(
+                    sampled_token_ids, self.requests, self.input_batch,
+                    scheduler_output.num_scheduled_tokens)
+            else:
+                # When using padded-batch, the sampled_token_ids should be
+                # the gpu tensor of sampled tokens for each request, of shape
+                # (num_reqs, num_spec_tokens + 1) with rejected tokens having
+                # value -1.
+                assert isinstance(sampled_token_ids, torch.Tensor), \
+                    "sampled_token_ids should be a torch.Tensor when " \
+                    "padded-batch is enabled."
+                next_token_ids, valid_sampled_tokens_count = \
+                    self.drafter.prepare_next_token_ids_padded(
+                        common_attn_metadata,
+                        sampled_token_ids,
+                        self.requests,
+                        self.input_batch,
+                        self.discard_request_indices.gpu,
+                        self.num_discarded_requests
+                    )
 
             if spec_decode_metadata is None:
+                token_indices_to_sample = None
                 # input_ids can be None for multimodal models.
                 target_token_ids = self.input_ids.gpu[:num_scheduled_tokens]
                 # TODO(woosuk): Support M-RoPE.
@@ -2361,17 +2395,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 else:
                     target_hidden_states = hidden_states[:num_scheduled_tokens]
             else:
-                # TODO(woosuk): Refactor this.
-                num_draft_tokens = spec_decode_metadata.num_draft_tokens
-                num_rejected_tokens = [
-                    n + 1 - len(sampled_token_ids[i]) if n > 0 else 0
-                    for i, n in enumerate(num_draft_tokens)
-                ]
-                num_rejected_tokens_cpu = torch.tensor(num_rejected_tokens,
-                                                       dtype=torch.int32)
-                common_attn_metadata, token_indices =\
-                    self.drafter.prepare_inputs(
-                    common_attn_metadata, num_rejected_tokens_cpu)
+                if self.speculative_config.disable_padded_drafter_batch:
+                    token_indices_to_sample = None
+                    common_attn_metadata, token_indices =\
+                        self.drafter.prepare_inputs(
+                            common_attn_metadata,
+                            sampled_token_ids,
+                            spec_decode_metadata.num_draft_tokens)
+                else:
+                    common_attn_metadata, token_indices, \
+                        token_indices_to_sample =\
+                        self.drafter.prepare_inputs_padded(
+                            common_attn_metadata,
+                            spec_decode_metadata,
+                            valid_sampled_tokens_count)
 
                 target_token_ids = self.input_ids.gpu[token_indices]
                 # TODO(woosuk): Support M-RoPE.
@@ -2391,6 +2428,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 target_positions=target_positions,
                 target_hidden_states=target_hidden_states,
                 next_token_ids=next_token_ids,
+                last_token_indices=token_indices_to_sample,
                 sampling_metadata=sampling_metadata,
                 common_attn_metadata=common_attn_metadata,
                 mm_embeds=mm_embeds,

From a904ea78eaf7fc3f9b136a1ba6f6f66fb5658496 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 17 Sep 2025 22:30:02 -0700
Subject: [PATCH 12/58] [benchmark] add peak throughput metrics and plot
 (#23867)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 vllm/benchmarks/lib/endpoint_request_func.py |   5 +
 vllm/benchmarks/serve.py                     | 198 ++++++++++++-------
 2 files changed, 134 insertions(+), 69 deletions(-)

diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index e640630476630..066b8fe834380 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -89,6 +89,7 @@ class RequestFuncOutput:
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
+    start_time: float = 0.0
 
 
 async def async_request_openai_completions(
@@ -140,6 +141,7 @@ async def async_request_openai_completions(
 
     generated_text = ""
     st = time.perf_counter()
+    output.start_time = st
     most_recent_timestamp = st
     try:
         async with session.post(url=api_url, json=payload,
@@ -272,6 +274,7 @@ async def async_request_openai_chat_completions(
     generated_text = ""
     ttft = 0.0
     st = time.perf_counter()
+    output.start_time = st
     most_recent_timestamp = st
     try:
         async with session.post(url=api_url, json=payload,
@@ -396,6 +399,7 @@ async def async_request_openai_audio(
         generated_text = ""
         ttft = 0.0
         st = time.perf_counter()
+        output.start_time = st
         most_recent_timestamp = st
         try:
             async with session.post(url=api_url,
@@ -475,6 +479,7 @@ async def async_request_openai_embeddings(
 
     output = RequestFuncOutput()
     st = time.perf_counter()
+    output.start_time = st
     try:
         async with session.post(
             url=api_url,
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 1aeef0fd5bd85..d8784340eba15 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -18,9 +18,11 @@ On the client side, run:
 import argparse
 import asyncio
 import gc
+import importlib.util
 import json
 import os
 import random
+import shutil
 import time
 import warnings
 from collections.abc import AsyncGenerator, Iterable
@@ -46,6 +48,9 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
+TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
+                          and (shutil.which("gnuplot") is not None))
+
 
 class TaskType(Enum):
     GENERATION = "generation"
@@ -80,18 +85,23 @@ class BenchmarkMetrics:
     median_e2el_ms: float
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
+    # Max output tokens per second and concurrent requests at that peak
+    max_output_tokens_per_s: float
+    max_concurrent_requests: int
+
 
 @dataclass
 class EmbedBenchmarkMetrics:
     completed: int
     total_input: int
     request_throughput: float
-    total_token_throughput :float
+    total_token_throughput: float
     mean_e2el_ms: float
     std_e2el_ms: float
     median_e2el_ms: float
     percentiles_e2el_ms: float
 
+
 def _get_current_request_rate(
     ramp_up_strategy: Optional[Literal["linear", "exponential"]],
     ramp_up_start_rps: Optional[int],
@@ -150,8 +160,8 @@ async def get_request(
     assert burstiness > 0, (
         f"A positive burstiness factor is expected, but given {burstiness}.")
     # Convert to list to get length for ramp-up calculations
-    if isinstance(input_requests, Iterable) and not isinstance(
-            input_requests, list):
+    if isinstance(input_requests,
+                  Iterable) and not isinstance(input_requests, list):
         input_requests = list(input_requests)
 
     total_requests = len(input_requests)
@@ -161,12 +171,9 @@ async def get_request(
     request_rates = []
     delay_ts = []
     for request_index, request in enumerate(input_requests):
-        current_request_rate = _get_current_request_rate(ramp_up_strategy,
-                                                         ramp_up_start_rps,
-                                                         ramp_up_end_rps,
-                                                         request_index,
-                                                         total_requests,
-                                                         request_rate)
+        current_request_rate = _get_current_request_rate(
+            ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps,
+            request_index, total_requests, request_rate)
         request_rates.append(current_request_rate)
         if current_request_rate == float("inf"):
             delay_ts.append(0)
@@ -206,10 +213,8 @@ async def get_request(
 
 
 def calculate_metrics_for_embeddings(
-    outputs: list[RequestFuncOutput], 
-    dur_s: float, 
-    selected_percentiles: list[float]
-) -> EmbedBenchmarkMetrics:
+        outputs: list[RequestFuncOutput], dur_s: float,
+        selected_percentiles: list[float]) -> EmbedBenchmarkMetrics:
     """Calculate the metrics for the embedding requests.
 
     Args:
@@ -242,10 +247,8 @@ def calculate_metrics_for_embeddings(
         mean_e2el_ms=np.mean(e2els or 0) * 1000,
         std_e2el_ms=np.std(e2els or 0) * 1000,
         median_e2el_ms=np.median(e2els or 0) * 1000,
-        percentiles_e2el_ms=[
-            (p, np.percentile(e2els or 0, p) * 1000) 
-            for p in selected_percentiles
-        ],
+        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
+                             for p in selected_percentiles],
     )
     return metrics
 
@@ -336,6 +339,67 @@ def calculate_metrics(
             "All requests failed. This is likely due to a misconfiguration "
             "on the benchmark arguments.",
             stacklevel=2)
+
+    # Calculate max output tokens per second metric
+    max_output_tokens_per_s = 0.0
+    max_concurrent_requests = 0
+
+    # Find the time range across all successful requests
+    successful_outputs = [output for output in outputs if output.success]
+    if successful_outputs:
+        min_start_time = min(output.start_time
+                             for output in successful_outputs)
+        max_end_time = max(output.start_time + output.latency
+                           for output in successful_outputs)
+
+        # Create second buckets (ceiling to ensure we capture all time)
+        duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1
+        tokens_per_second = np.zeros(duration_seconds)
+        concurrent_requests_per_second = np.zeros(duration_seconds)
+
+        for i, output in enumerate(successful_outputs):
+            # Calculate token generation timestamp using
+            # start_time, ttft, and itl
+            token_times = [output.start_time + output.ttft]
+            current_time = token_times[0]
+            for itl_value in output.itl:
+                current_time += itl_value
+                token_times.append(current_time)
+
+            # Add tokens to second buckets
+            for token_time in token_times:
+                second_bucket = int(token_time - min_start_time)
+                if 0 <= second_bucket < duration_seconds:
+                    tokens_per_second[second_bucket] += 1
+
+            # Track concurrent requests for each second this request was active
+            request_start_second = int(output.start_time - min_start_time)
+            request_end_second = int((output.start_time + output.latency) -
+                                     min_start_time)
+
+            for second in range(request_start_second, request_end_second + 1):
+                concurrent_requests_per_second[second] += 1
+
+        # Find the peak tokens per second and, independently, the peak
+        # number of concurrent requests (not necessarily the same second)
+        if len(tokens_per_second) > 0:
+            max_output_tokens_per_s = float(np.max(tokens_per_second))
+            max_concurrent_requests = int(
+                np.max(concurrent_requests_per_second))
+
+        if TERM_PLOTLIB_AVAILABLE:
+            import termplotlib as tpl
+            fig = tpl.figure()
+            fig.plot(np.arange(len(tokens_per_second)),
+                     tokens_per_second,
+                     title="Output tokens per second")
+            fig.plot(np.arange(len(concurrent_requests_per_second)),
+                     concurrent_requests_per_second,
+                     title="Concurrent requests per second")
+            fig.show()
+        else:
+            print("tip: install termplotlib and gnuplot to plot the metrics")
+
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
@@ -365,6 +429,8 @@ def calculate_metrics(
         median_e2el_ms=np.median(e2els or 0) * 1000,
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
+        max_output_tokens_per_s=max_output_tokens_per_s,
+        max_concurrent_requests=max_concurrent_requests,
     )
 
     return metrics, actual_output_lens
@@ -396,11 +462,8 @@ async def benchmark(
     ramp_up_end_rps: Optional[int] = None,
     ready_check_timeout_sec: int = 600,
 ):
-    task_type = (
-        TaskType.EMBEDDING
-        if api_url.endswith("/v1/embeddings")
-        else TaskType.GENERATION
-    )
+    task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else
+                 TaskType.GENERATION)
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         if task_type == TaskType.EMBEDDING:
             request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
@@ -435,14 +498,10 @@ async def benchmark(
         input_requests[0].multi_modal_data,
     )
 
-    assert (
-        test_mm_content is None
-        or isinstance(test_mm_content, dict)
-        or (
-            isinstance(test_mm_content, list)
-            and all(isinstance(item, dict) for item in test_mm_content)
-        )
-    ), "multi_modal_data must be a dict or list[dict]"
+    assert (test_mm_content is None or isinstance(test_mm_content, dict)
+            or (isinstance(test_mm_content, list)
+                and all(isinstance(item, dict) for item in test_mm_content))
+            ), "multi_modal_data must be a dict or list[dict]"
     test_input = RequestFuncInput(
         model=model_id,
         model_name=model_name,
@@ -488,13 +547,13 @@ async def benchmark(
                                          ignore_eos=ignore_eos,
                                          extra_headers=extra_headers,
                                          extra_body=extra_body)
-        profile_output = await request_func(
-            request_func_input=profile_input, session=session)
+        profile_output = await request_func(request_func_input=profile_input,
+                                            session=session)
         if profile_output.success:
             print("Profiler started")
 
-    distribution = ("Poisson process" if burstiness == 1.0
-                    else "Gamma distribution")
+    distribution = ("Poisson process"
+                    if burstiness == 1.0 else "Gamma distribution")
 
     if ramp_up_strategy is not None:
         print(f"Traffic ramp-up strategy: {ramp_up_strategy}.")
@@ -562,18 +621,20 @@ async def benchmark(
             req_lora_module = next(lora_modules)
             req_model_id, req_model_name = req_lora_module, req_lora_module
 
-        request_func_input = RequestFuncInput(model=req_model_id,
-                                              model_name=req_model_name,
-                                              prompt=prompt,
-                                              api_url=api_url,
-                                              prompt_len=prompt_len,
-                                              output_len=output_len,
-                                              logprobs=logprobs,
-                                              multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos,
-                                              extra_headers=extra_headers,
-                                              extra_body=extra_body,
-                                              request_id=request_id,)
+        request_func_input = RequestFuncInput(
+            model=req_model_id,
+            model_name=req_model_name,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            logprobs=logprobs,
+            multi_modal_content=mm_content,
+            ignore_eos=ignore_eos,
+            extra_headers=extra_headers,
+            extra_body=extra_body,
+            request_id=request_id,
+        )
         tasks.append(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input,
@@ -615,19 +676,21 @@ async def benchmark(
                                     benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
     if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10}".format(
-            "Total generated tokens:", metrics.total_output))
+        print("{:<40} {:<10}".format("Total generated tokens:",
+                                     metrics.total_output))
     print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                     metrics.request_throughput))
     if goodput_config_dict:
         print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
                                         metrics.request_goodput))
     if isinstance(metrics, BenchmarkMetrics):
-        print(
-            "{:<40} {:<10.2f}".format(
-                "Output token throughput (tok/s):", metrics.output_throughput
-            )
-        )
+        print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                                        metrics.output_throughput))
+        print("{:<40} {:<10.2f}".format(
+            "Peak output token throughput (tok/s):",
+            metrics.max_output_tokens_per_s))
+        print("{:<40} {:<10.2f}".format("Peak concurrent requests:",
+                                        metrics.max_concurrent_requests))
     print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
                                     metrics.total_token_throughput))
 
@@ -648,6 +711,8 @@ async def benchmark(
             "itls": [output.itl for output in outputs],
             "generated_texts": [output.generated_text for output in outputs],
             "errors": [output.error for output in outputs],
+            "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
+            "max_concurrent_requests": metrics.max_concurrent_requests,
         }
     else:
         result = {
@@ -697,8 +762,8 @@ async def benchmark(
 
     if task_type == TaskType.GENERATION:
         process_one_metric("ttft", "TTFT", "Time to First Token")
-        process_one_metric(
-            "tpot", "TPOT", "Time per Output Token (excl. 1st token)")
+        process_one_metric("tpot", "TPOT",
+                           "Time per Output Token (excl. 1st token)")
         process_one_metric("itl", "ITL", "Inter-token Latency")
     process_one_metric("e2el", "E2EL", "End-to-end Latency")
 
@@ -714,8 +779,8 @@ async def benchmark(
             output_len=test_output_len,
             logprobs=logprobs,
         )
-        profile_output = await request_func(
-            request_func_input=profile_input, session=session)
+        profile_output = await request_func(request_func_input=profile_input,
+                                            session=session)
         if profile_output.success:
             print("Profiler stopped")
 
@@ -851,7 +916,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
     parser.add_argument("--use-beam-search", action="store_true")
     parser.add_argument(
@@ -982,7 +1048,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="Specify the prefix of request id.",
     )
 
-
     sampling_group = parser.add_argument_group("sampling parameters")
     sampling_group.add_argument(
         "--top-p",
@@ -1047,8 +1112,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ramp-up strategy. This would be used to "
         "ramp up the request rate from initial RPS to final "
         "RPS rate (specified by --ramp-up-start-rps and "
-        "--ramp-up-end-rps.) over the duration of the benchmark."
-    )
+        "--ramp-up-end-rps.) over the duration of the benchmark.")
     parser.add_argument(
         "--ramp-up-start-rps",
         type=int,
@@ -1087,13 +1151,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             raise ValueError(
                 "When using ramp-up, do not specify --request-rate. "
                 "The request rate will be controlled by ramp-up parameters. "
-                "Please remove the --request-rate argument."
-            )
+                "Please remove the --request-rate argument.")
         if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
             raise ValueError(
                 "When using --ramp-up-strategy, both --ramp-up-start-rps and "
-                "--ramp-up-end-rps must be specified"
-            )
+                "--ramp-up-end-rps must be specified")
         if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
             raise ValueError("Ramp-up start and end RPS must be non-negative")
         if args.ramp_up_start_rps > args.ramp_up_end_rps:
@@ -1127,8 +1189,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
                 headers[kvstring[0].strip()] = kvstring[1].strip()
             else:
                 raise ValueError(
-                    "Invalid header format. Please use KEY=VALUE format."
-                )
+                    "Invalid header format. Please use KEY=VALUE format.")
 
     tokenizer = get_tokenizer(tokenizer_id,
                               tokenizer_mode=tokenizer_mode,
@@ -1215,8 +1276,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
                 result_json[kvstring[0].strip()] = kvstring[1].strip()
             else:
                 raise ValueError(
-                    "Invalid metadata format. Please use KEY=VALUE format."
-                )
+                    "Invalid metadata format. Please use KEY=VALUE format.")
 
     # Traffic
     result_json["request_rate"] = (args.request_rate if args.request_rate

From e111d5b0ae9359e2a829771105e739d36505fa69 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 17 Sep 2025 22:30:26 -0700
Subject: [PATCH 13/58] [CLI] Use streaming in CLI chat and completion commands
 (#23769)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 vllm/entrypoints/cli/openai.py | 71 +++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 26 deletions(-)

diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 7c01de94a3436..1929d6a7f77af 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -45,6 +45,28 @@ def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]:
     return model_name, openai_client
 
 
+def _print_chat_stream(stream) -> str:
+    output = ""
+    for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.content:
+            output += delta.content
+            print(delta.content, end="", flush=True)
+    print()
+    return output
+
+
+def _print_completion_stream(stream) -> str:
+    output = ""
+    for chunk in stream:
+        text = chunk.choices[0].text
+        if text is not None:
+            output += text
+            print(text, end="", flush=True)
+    print()
+    return output
+
+
 def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
     conversation: list[ChatCompletionMessageParam] = []
     if system_prompt is not None:
@@ -58,14 +80,11 @@ def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None:
             break
         conversation.append({"role": "user", "content": input_message})
 
-        chat_completion = client.chat.completions.create(model=model_name,
-                                                         messages=conversation)
-
-        response_message = chat_completion.choices[0].message
-        output = response_message.content
-
-        conversation.append(response_message)  # type: ignore
-        print(output)
+        stream = client.chat.completions.create(model=model_name,
+                                                messages=conversation,
+                                                stream=True)
+        output = _print_chat_stream(stream)
+        conversation.append({"role": "assistant", "content": output})
 
 
 def _add_query_options(
@@ -108,9 +127,11 @@ class ChatCommand(CLISubcommand):
         if args.quick:
             conversation.append({"role": "user", "content": args.quick})
 
-            chat_completion = client.chat.completions.create(
-                model=model_name, messages=conversation)
-            print(chat_completion.choices[0].message.content)
+            stream = client.chat.completions.create(model=model_name,
+                                                    messages=conversation,
+                                                    stream=True)
+            output = _print_chat_stream(stream)
+            conversation.append({"role": "assistant", "content": output})
             return
 
         print("Please enter a message for the chat model:")
@@ -121,14 +142,11 @@ class ChatCommand(CLISubcommand):
                 break
             conversation.append({"role": "user", "content": input_message})
 
-            chat_completion = client.chat.completions.create(
-                model=model_name, messages=conversation)
-
-            response_message = chat_completion.choices[0].message
-            output = response_message.content
-
-            conversation.append(response_message)  # type: ignore
-            print(output)
+            stream = client.chat.completions.create(model=model_name,
+                                                    messages=conversation,
+                                                    stream=True)
+            output = _print_chat_stream(stream)
+            conversation.append({"role": "assistant", "content": output})
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
@@ -168,9 +186,10 @@ class CompleteCommand(CLISubcommand):
         model_name, client = _interactive_cli(args)
 
         if args.quick:
-            completion = client.completions.create(model=model_name,
-                                                   prompt=args.quick)
-            print(completion.choices[0].text)
+            stream = client.completions.create(model=model_name,
+                                               prompt=args.quick,
+                                               stream=True)
+            _print_completion_stream(stream)
             return
 
         print("Please enter prompt to complete:")
@@ -179,10 +198,10 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            completion = client.completions.create(model=model_name,
-                                                   prompt=input_prompt)
-            output = completion.choices[0].text
-            print(output)
+            stream = client.completions.create(model=model_name,
+                                               prompt=input_prompt,
+                                               stream=True)
+            _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

From 81b16a2bc922e837267db7216a274c4d89a2cc0c Mon Sep 17 00:00:00 2001
From: Lumina <starry.qvq@gmail.com>
Date: Thu, 18 Sep 2025 13:53:55 +0800
Subject: [PATCH 14/58] [Kernel] Better inf handling for grouped topk cu
 (#24886)

Signed-off-by: lumina37 <starry.qvq@gmail.com>
---
 csrc/moe/grouped_topk_kernels.cu | 44 +++++++++++++++++---------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu
index accbb09858fac..b5321f748e6be 100644
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@@ -21,6 +21,7 @@
 #include <torch/all.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
+#include <cuda/std/limits>
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
@@ -28,7 +29,6 @@ namespace cg = cooperative_groups;
 namespace vllm {
 namespace moe {
 
-constexpr float kNegInfinity = INFINITY * -1;
 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 constexpr int32_t WARP_SIZE = 32;
 constexpr int32_t BLOCK_SIZE = 512;
@@ -411,14 +411,21 @@ __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
   return __bfloat162float(val);
 }
 
+template <typename T>
+__device__ inline T neg_inf() {
+  // cuda::std::numeric_limits<T>::infinity() returns `0` for [T=bf16 or fp16]
+  // so we need to cast from fp32
+  return cuda_cast<T, float>(-cuda::std::numeric_limits<float>::infinity());
+}
+
 template <typename T>
 __device__ void topk_with_k2(T* output, T const* input,
                              cg::thread_block_tile<32> const& tile,
                              int32_t const lane_id,
                              int const num_experts_per_group) {
   // Get the top2 per thread
-  T largest = -INFINITY;
-  T second_largest = -INFINITY;
+  T largest = neg_inf<T>();
+  T second_largest = neg_inf<T>();
 
   if (num_experts_per_group > WARP_SIZE) {
     for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
@@ -513,8 +520,8 @@ __global__ void group_idx_and_topk_idx_kernel(
       warp_id * topk;
   s_topk_idx += warp_id * topk;
 
-  T value = kNegInfinity;
-  T topk_group_value = kNegInfinity;
+  T value = neg_inf<T>();
+  T topk_group_value = neg_inf<T>();
   int32_t num_equalto_topkth_group;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@@ -525,11 +532,8 @@ __global__ void group_idx_and_topk_idx_kernel(
   if (case_id < num_tokens) {
     // calculate group_idx
     int32_t target_num_min = WARP_SIZE - n_group + topk_group;
-    if (lane_id < n_group &&
-        (isfinite(cuda_cast<float, T>(
-            group_scores[lane_id]))))  // The check is necessary to avoid
-                                       // abnormal input
-    {
+    // The check is necessary to avoid abnormal input
+    if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) {
       value = group_scores[lane_id];
     }
 
@@ -540,11 +544,11 @@ __global__ void group_idx_and_topk_idx_kernel(
       __syncwarp();  // Ensure all threads have valid data before reduction
       topk_group_value = cg::reduce(tile, value, cg::greater<T>());
       if (value == topk_group_value) {
-        value = kNegInfinity;
+        value = neg_inf<T>();
       }
       pre_count_equal_to_top_value = count_equal_to_top_value;
-      count_equal_to_top_value = __popc(__ballot_sync(
-          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
+      count_equal_to_top_value =
+          __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf<T>())));
     }
     num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
   }
@@ -552,11 +556,10 @@ __global__ void group_idx_and_topk_idx_kernel(
 
   warp_topk::WarpSelect</*capability*/ WARP_SIZE, /*greater*/ true, T, int32_t,
                         /* is_stable */ true>
-      queue((int32_t)topk, -INFINITY);
+      queue((int32_t)topk, neg_inf<T>());
 
   int count_equalto_topkth_group = 0;
-  bool if_proceed_next_topk =
-      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
+  bool if_proceed_next_topk = topk_group_value != neg_inf<T>();
   if (case_id < num_tokens && if_proceed_next_topk) {
     for (int i_group = 0; i_group < n_group; i_group++) {
       if ((group_scores[i_group] > topk_group_value) ||
@@ -566,10 +569,10 @@ __global__ void group_idx_and_topk_idx_kernel(
         for (int32_t i = lane_id; i < align_num_experts_per_group;
              i += WARP_SIZE) {
           T candidates =
-              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
-                                                 scores_with_bias[offset + i]))
+              (i < num_experts_per_group) &&
+                      cuda::std::isfinite(scores_with_bias[offset + i])
                   ? scores_with_bias[offset + i]
-                  : cuda_cast<T, float>(kNegInfinity);
+                  : neg_inf<T>();
           queue.add(candidates, offset + i);
         }
         if (group_scores[i_group] == topk_group_value) {
@@ -598,7 +601,8 @@ __global__ void group_idx_and_topk_idx_kernel(
       if (i < topk) {
         s_topk_value[i] = value;
       }
-      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
+      topk_sum +=
+          cg::reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
     }
   }
 

From 349e0e34627950db1cc4be0df9a0bc616e210589 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 07:23:29 +0100
Subject: [PATCH 15/58] [Docs] Fix API Reference (#25140)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 mkdocs.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mkdocs.yaml b/mkdocs.yaml
index bbd850bdfee34..6f2be65a18af8 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -79,7 +79,7 @@ plugins:
         - "re:vllm\\._.*"  # Internal modules
         - "vllm.third_party"
         - "vllm.vllm_flash_attn"
-        - !ENV [API_AUTONAV_EXCLUDE, ""]
+        - !ENV [API_AUTONAV_EXCLUDE, "re:^$"]  # Match nothing by default
   - mkdocstrings:
       handlers:
         python:

From f4cd80f94404787859ba72dcddb5e818d8f0c9e7 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 07:29:05 +0100
Subject: [PATCH 16/58] Retrieve `sliding_window` from text config in Gemma3 MM
 (#25085)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/gemma3_mm.py  | 3 ++-
 vllm/model_executor/models/gemma3n_mm.py | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index e652ba2f1c7fe..bee9fbd2c084a 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -688,7 +688,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
             global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
             global_attn_masks.append(global_attn_mask)
 
-            if (sliding_window := self.config.sliding_window) is not None:
+            sliding_window = self.config.text_config.sliding_window
+            if sliding_window is not None:
                 # Create a local causal mask with sliding window (1024).
                 local_attn_mask = torch.ones_like(global_attn_mask)
                 local_attn_mask = torch.tril(local_attn_mask,
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 663d4da7cec23..8d3079aee0dfb 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -461,9 +461,6 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
         self.multimodal_config = multimodal_config
         self.vocab_size = config.text_config.vocab_size
 
-        self.sliding_window = getattr(config.text_config,
-                                      "interleaved_sliding_window", None)
-
         self.vision_tower = AutoModel.from_config(config=config.vision_config)
         self.audio_tower = AutoModel.from_config(config=config.audio_config)
         self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config,

From 350c94deb30747f84536ee34d91c6fca564667ce Mon Sep 17 00:00:00 2001
From: "rongfu.leng" <rongfu.leng@daocloud.io>
Date: Thu, 18 Sep 2025 15:47:43 +0800
Subject: [PATCH 17/58] [Bugfix] when use s3 model cannot use default
 load_format (#24435)

Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 vllm/config/__init__.py  | 12 ++++++++++++
 vllm/engine/arg_utils.py |  1 -
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 64be2f38c6a31..631618d427d42 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3029,6 +3029,18 @@ class VllmConfig:
                 SequenceClassificationConfig)
             SequenceClassificationConfig.verify_and_update_config(self)
 
+        if hasattr(self.model_config, "model_weights") and is_runai_obj_uri(
+                self.model_config.model_weights):
+            if self.load_config.load_format == "auto":
+                logger.info("Detected Run:ai model config. "
+                            "Overriding `load_format` to 'runai_streamer'")
+                self.load_config.load_format = "runai_streamer"
+            elif self.load_config.load_format != "runai_streamer":
+                raise ValueError(f"To load a model from S3, 'load_format' "
+                                 f"must be 'runai_streamer', "
+                                 f"but got '{self.load_config.load_format}'. "
+                                 f"Model: {self.model_config.model}")
+
     def __str__(self):
         return (
             f"model={self.model_config.model!r}, "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4831cb5348c77..e2a1ec68e6f53 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -959,7 +959,6 @@ class EngineArgs:
         if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
                 and self.model in MODELS_ON_S3 and self.load_format == "auto"):
             self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
-            self.load_format = "runai_streamer"
 
         if self.disable_mm_preprocessor_cache:
             logger.warning(

From ef7eefe17a7dc212ddb8a8aabd7760218a10e25e Mon Sep 17 00:00:00 2001
From: Tao He <linzhu.ht@alibaba-inc.com>
Date: Thu, 18 Sep 2025 16:16:04 +0800
Subject: [PATCH 18/58] [Qwen] Add fp8 checkpoint support for qwen3-next.
 (#25079)

Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
---
 vllm/model_executor/models/qwen3_next.py     | 35 ++++++++++----------
 vllm/model_executor/models/qwen3_next_mtp.py |  8 +++--
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index ca9f4d402dac2..eb060cb90f44c 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -30,7 +30,6 @@ from vllm.model_executor.layers.layernorm import (
     GemmaRMSNorm as Qwen3NextRMSNorm)
 # yapf: enable
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
@@ -254,12 +253,20 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         # projection of the input hidden states
         self.projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
         self.projection_size_ba = self.num_v_heads * 2
-        self.in_proj = MergedColumnParallelLinear(
+        self.in_proj_qkvz = ColumnParallelLinear(
             input_size=self.hidden_size,
-            output_sizes=[self.projection_size_qkvz, self.projection_size_ba],
+            output_size=self.projection_size_qkvz,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.in_proj",
+            prefix=f"{prefix}.in_proj_qkvz",
+        )
+        # ba_proj doesn't support blockwise fp8 quantization.
+        self.in_proj_ba = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.projection_size_ba,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj_ba",
         )
 
         query_key_settings = (self.key_dim, 0, False)
@@ -420,19 +427,14 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
         ssm_state = self_kv_cache[1]
         num_actual_tokens = attn_metadata.num_actual_tokens
         num_accepted_tokens = attn_metadata.num_accepted_tokens
-
-        # 1. Set up dimensions for reshapes later
-        projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens])
         if spec_token_masks is not None:
             spec_token_masks = spec_token_masks[:num_actual_tokens]
-        projected_states_qkvz, projected_states_ba = torch.split(
-            projected_states,
-            [
-                self.projection_size_qkvz // self.tp_size,
-                self.projection_size_ba // self.tp_size
-            ],
-            dim=-1,
-        )
+
+        # 1. Set up dimensions for reshapes later
+        projected_states_qkvz, _ = self.in_proj_qkvz(
+            hidden_states[:num_actual_tokens])
+        projected_states_ba, _ = self.in_proj_ba(
+            hidden_states[:num_actual_tokens])
         query, key, value, z, b, a = self.fix_query_key_value_ordering(
             projected_states_qkvz, projected_states_ba)
         query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'),
@@ -976,8 +978,6 @@ class Qwen3NextModel(nn.Module):
             ("qkv_proj", "v_proj", "v"),
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
-            ("in_proj", "in_proj_qkvz", 0),
-            ("in_proj", "in_proj_ba", 1),
         ]
 
         params_dict = dict(self.named_parameters())
@@ -1055,7 +1055,6 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
             "v_proj",
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
-        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
index 190a1750e673a..c755eeb9b4eaa 100644
--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -63,7 +63,9 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
                                        self.config.hidden_size,
                                        gather_output=True,
                                        bias=False,
-                                       return_bias=False)
+                                       return_bias=False,
+                                       quant_config=quant_config,
+                                       prefix=f'{prefix}.fc')
 
         self.layers = torch.nn.ModuleList(
             Qwen3NextDecoderLayer(
@@ -72,7 +74,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
                 model_config=model_config,
                 cache_config=cache_config,
                 quant_config=quant_config,
-                prefix=f'{prefix}.layers.{self.mtp_start_layer_idx + idx}',
+                prefix=f'{prefix}.layers.{idx}',
             ) for idx in range(self.num_mtp_layers))
 
         self.make_empty_intermediate_tensors = (
@@ -233,7 +235,7 @@ class Qwen3NextMTP(nn.Module, SupportsPP):
         self.config = config
         self.model = Qwen3NextMultiTokenPredictor(vllm_config=vllm_config,
                                                   prefix=maybe_prefix(
-                                                      prefix, "model"))
+                                                      prefix, "mtp"))
         self.unpadded_vocab_size = config.vocab_size
         self.lm_head = ParallelLMHead(self.unpadded_vocab_size,
                                       config.hidden_size,

From aa3f105c591a506523804e12800adcca80480bd8 Mon Sep 17 00:00:00 2001
From: Gerard Finol <gerardfinol@gmail.com>
Date: Thu, 18 Sep 2025 11:02:14 +0200
Subject: [PATCH 19/58] Add 'path' option to ImagePrompt data_format (#25081)

Signed-off-by: Gerard Finol <gerard.finol@urv.cat>
---
 .../prithvi_io_processor_plugin/prithvi_io_processor/types.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
index d480aef704c61..d4c6628211fb2 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
@@ -22,7 +22,7 @@ class DataModuleConfig(TypedDict):
 
 class ImagePrompt(BaseModel):
 
-    data_format: Literal["b64_json", "bytes", "url"]
+    data_format: Literal["b64_json", "bytes", "url", "path"]
     """
     This is the data type for the input image
     """

From 05b044e698bb3c151871d94b64fabd87188de9ef Mon Sep 17 00:00:00 2001
From: Punitvara <punitvara@gmail.com>
Date: Thu, 18 Sep 2025 14:35:16 +0530
Subject: [PATCH 20/58] [Doc] Fix cross-reference warnings (#25058)

Signed-off-by: Punit Vara <punitvara@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/benchmarks/datasets.py                          |  3 ++-
 .../device_communicators/shm_object_storage.py       |  8 ++++----
 .../model_executor/layers/mamba/ops/causal_conv1d.py | 12 +++++++-----
 vllm/model_executor/models/mistral3.py               |  2 +-
 vllm/multimodal/profiling.py                         |  2 +-
 vllm/v1/core/kv_cache_manager.py                     |  5 +++--
 6 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 1831539a6adbe..1cab40802c392 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -171,7 +171,8 @@ class BenchmarkDataset(ABC):
                 If `None`, LoRA is not used.
 
         Returns:
-            A new [LoRARequest][] (or `None` if not applicable).
+            A new [`LoRARequest`][vllm.lora.request.LoRARequest]
+            (or `None` if not applicable).
         """
         if max_loras is None or lora_path is None:
             return None
diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py
index 3fac104bda1e8..352e7525d4c84 100644
--- a/vllm/distributed/device_communicators/shm_object_storage.py
+++ b/vllm/distributed/device_communicators/shm_object_storage.py
@@ -30,7 +30,7 @@ class SingleWriterShmRingBuffer:
     - Maintains metadata for each allocated buffer chunk in the writer process
     - Supports custom "is_free_fn" functions to determine when buffers can be
       reused
-    - Each buffer chunk contains: [4-byte id][4-byte size][actual_data]
+    - Each buffer chunk contains: `[4-byte id][4-byte size][actual_data]`
     
     Key Concepts:
     - monotonic_id_start/end: Track the range of active buffer IDs
@@ -99,7 +99,7 @@ class SingleWriterShmRingBuffer:
     - Writer handles garbage collection (free_buf) based on reader feedback
     
     Memory Layout per Buffer Chunk:
-    [4-byte monotonic_id][4-byte chunk_size][actual_data...]
+    `[4-byte monotonic_id][4-byte chunk_size][actual_data...]`
     ^metadata_start                         ^data_start
     
     The monotonic_id ensures data integrity - readers can verify they're
@@ -185,7 +185,7 @@ class SingleWriterShmRingBuffer:
         '''
         Allocate a buffer `MD_SIZE` + `size` bytes in the shared memory.
         Memory layout:
-        [4-byte monotonic_id][4-byte size][buffer data...]
+        `[4-byte monotonic_id][4-byte size][buffer data...]`
         '''
         assert self.is_writer, "Only the writer can allocate buffers."
         assert size > 0, "Size must be greater than 0"
@@ -413,7 +413,7 @@ class SingleWriterShmObjectStorage:
       allocation
 
     Memory Layout per Object:
-    [4-byte reference_count][metadata_size][serialized_object_data]
+    `[4-byte reference_count][metadata_size][serialized_object_data]`
     
     Thread Safety:
     - Writer operations (put, clear) are single-threaded by design
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 7e3ea561fd293..2a88fa661da01 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -927,11 +927,13 @@ def causal_conv1d_update(
     validate_data=False,
 ):
     """
-    x: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim)
-        [shape=2: single token prediction]
-        [shape=3: single or multiple tokens prediction]
-        [shape=2 with num_tokens: continuous batching, where num_tokens is the
-                                  total tokens of all sequences in that batch]
+    x: Input tensor which can take the following shapes:
+
+    - `[batch, dim]` - single token prediction
+    - `[batch, dim, seqlen]` - single or multiple tokens prediction
+    - `[num_tokens, dim]` - continuous batching, where num_tokens is
+        the total tokens of all sequences in that batch
+
     conv_state: (..., dim, state_len), where state_len >= width - 1
     weight: (dim, width)
     bias: (dim,)
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 09479012a03ad..d15776a39362d 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -583,7 +583,7 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
             inputs_embeds: Optional tensor of input embeddings.
 
         Info:
-            [Mistral3ImagePixelInputs][]
+            [`Mistral3ImagePixelInputs`][vllm.model_executor.models.mistral3.Mistral3ImagePixelInputs]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index bad6c0c3d9db2..fbbc55d3524ca 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -301,7 +301,7 @@ class MultiModalProfiler(Generic[_I]):
         Returns the maximum length of the multimodal (image placeholders+text)
         tokens, including any break/text tokens in-between image embeddings.
 
-        <im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>
+        `<im_start> [IMG] [IMG] [IMG] <row_break> [IMG] [IMG] [IMG] <im_end>`
         Returns 9, even when the number of image embeddings is 6.
         
         This is important to take into account when profiling and
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 3a0fbb5e5c41e..401327f727a4a 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -24,8 +24,9 @@ class KVCacheBlocks:
     """
     blocks: tuple[list[KVCacheBlock], ...]
     """
-    blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens.
-    We don't use block of tokens as the outer dimension because it assumes all
+    `blocks[i][j]` refers to the i-th kv_cache_group
+    and the j-th block of tokens. We don't use block of
+    tokens as the outer dimension because it assumes all
     kv_cache_groups have the same number of blocks, which is true for now but 
     will be broken if we want to give different block_size to different 
     kv_cache_groups in the future.

From 29283e89762a3d572c504e5ea317351696b553a6 Mon Sep 17 00:00:00 2001
From: Aaron Pham <contact@aarnphm.xyz>
Date: Thu, 18 Sep 2025 05:20:27 -0400
Subject: [PATCH 21/58] [Chore] Cleanup guided namespace, move to structured
 outputs config (#22772)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       |   6 -
 .buildkite/test-pipeline.yaml                 |   3 +-
 .github/mergify.yml                           |   2 +-
 .../benchmark_serving_structured_output.py    |  16 +-
 docs/api/README.md                            |   2 +-
 docs/features/reasoning_outputs.md            |  10 +-
 docs/features/structured_outputs.md           |  36 +--
 docs/features/tool_calling.md                 |  11 +-
 docs/serving/openai_compatible_server.md      |   4 +-
 .../offline_inference/structured_outputs.py   |  54 ++---
 ...t_completion_client_with_tools_required.py |   2 +-
 .../structured_outputs/structured_outputs.py  |   8 +-
 tests/entrypoints/conftest.py                 |   2 +-
 tests/entrypoints/llm/test_lazy_outlines.py   |  82 -------
 tests/entrypoints/openai/test_chat.py         | 123 +++++------
 tests/entrypoints/openai/test_completion.py   |  79 ++++---
 .../test_completion_with_function_calling.py  |   4 +-
 .../entrypoints/openai/test_openai_schema.py  |   8 +-
 .../openai/test_prompt_validation.py          |   2 +-
 tests/entrypoints/openai/test_serving_chat.py |   4 -
 .../openai/test_transcription_validation.py   |   2 +-
 .../openai/test_translation_validation.py     |   2 +-
 tests/test_sampling_params.py                 |  84 -------
 tests/tool_use/test_tool_choice_required.py   |  11 +-
 tests/v1/core/test_scheduler.py               |   6 +-
 tests/v1/engine/test_llm_engine.py            |   4 +-
 tests/v1/entrypoints/conftest.py              |   2 +-
 .../llm/test_struct_output_generate.py        | 135 ++++++------
 .../openai/test_chat_completion.py            |  14 +-
 .../v1/entrypoints/openai/test_completion.py  |  14 +-
 vllm/config/__init__.py                       |  35 +--
 vllm/engine/arg_utils.py                      |  95 ++++----
 vllm/engine/async_llm_engine.py               |   7 +-
 vllm/engine/llm_engine.py                     |  18 +-
 vllm/engine/protocol.py                       |   7 +-
 vllm/entrypoints/llm.py                       |  27 ++-
 vllm/entrypoints/openai/api_server.py         |  10 +-
 vllm/entrypoints/openai/protocol.py           | 206 ++++++------------
 vllm/entrypoints/openai/serving_chat.py       |   2 +-
 vllm/model_executor/models/config.py          |   6 +-
 vllm/sampling_params.py                       |  62 ++----
 vllm/transformers_utils/tokenizers/mistral.py |   5 +-
 vllm/v1/engine/async_llm.py                   |   3 -
 vllm/v1/engine/processor.py                   |  57 +++--
 vllm/v1/request.py                            |   2 +-
 vllm/v1/structured_output/__init__.py         |  13 +-
 vllm/v1/structured_output/backend_guidance.py |   4 +-
 .../backend_lm_format_enforcer.py             |  22 +-
 vllm/v1/structured_output/backend_outlines.py |  32 +--
 vllm/v1/structured_output/backend_xgrammar.py |  38 ++--
 vllm/v1/structured_output/request.py          |   2 +-
 51 files changed, 579 insertions(+), 806 deletions(-)
 delete mode 100644 tests/entrypoints/llm/test_lazy_outlines.py
 delete mode 100644 tests/test_sampling_params.py

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index c395011a24485..7f90181048d0f 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
   --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8dd99bf1a38f6..66dfc990805f2 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -108,8 +108,7 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
diff --git a/.github/mergify.yml b/.github/mergify.yml
index f2dd2e06214ae..94198b1251e09 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -171,7 +171,7 @@ pull_request_rules:
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_guided_generate.py
+      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 4aae755eb4e44..73b4aa5a87e07 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -696,11 +696,11 @@ def evaluate(ret, args):
         return re.match(args.regex, actual) is not None
 
     def _eval_correctness(expected, actual):
-        if args.structure_type == "guided_json":
+        if args.structure_type == "json":
             return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "guided_regex":
+        elif args.structure_type == "regex":
             return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "guided_choice":
+        elif args.structure_type == "choice":
             return _eval_correctness_choice(expected, actual)
         else:
             return None
@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
     )
 
     if args.dataset == "grammar":
-        args.structure_type = "guided_grammar"
+        args.structure_type = "grammar"
     elif args.dataset == "regex":
-        args.structure_type = "guided_regex"
+        args.structure_type = "regex"
     elif args.dataset == "choice":
-        args.structure_type = "guided_choice"
+        args.structure_type = "choice"
     else:
-        args.structure_type = "guided_json"
+        args.structure_type = "json"
 
     if args.no_structured_output:
         args.structured_output_ratio = 0
     if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}guided"
+        result_file_name = f"{args.structured_output_ratio}so"
         result_file_name += f"_{backend}"
         result_file_name += f"_{args.request_rate}qps"
         result_file_name += f"_{args.model.split('/')[-1]}"
diff --git a/docs/api/README.md b/docs/api/README.md
index 57142e8f5625d..148211756480c 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
 - [vllm.config.LoRAConfig][]
 - [vllm.config.MultiModalConfig][]
 - [vllm.config.PoolerConfig][]
-- [vllm.config.DecodingConfig][]
+- [vllm.config.StructuredOutputsConfig][]
 - [vllm.config.ObservabilityConfig][]
 - [vllm.config.KVTransferConfig][]
 - [vllm.config.CompilationConfig][]
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
index d518e7f0cff43..85681669dfb22 100644
--- a/docs/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
 
 | Model Series | Parser Name | Structured Output Support | Tool Calling |
 |--------------|-------------|------------------|-------------|
-| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
-| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
+| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
+| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
-| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
-| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ |
-| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ |
+| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
+| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
+| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
 
 !!! note
     IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md
index 0d6294a5fdd79..1f955c6e30d6c 100644
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
 
 The following parameters are supported, which must be added as extra parameters:
 
-- `guided_choice`: the output will be exactly one of the choices.
-- `guided_regex`: the output will follow the regex pattern.
-- `guided_json`: the output will follow the JSON schema.
-- `guided_grammar`: the output will follow the context free grammar.
+- `choice`: the output will be exactly one of the choices.
+- `regex`: the output will follow the regex pattern.
+- `json`: the output will follow the JSON schema.
+- `grammar`: the output will follow the context free grammar.
 - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
 
 You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
 
 Structured outputs are supported by default in the OpenAI-Compatible Server. You
 may choose to specify the backend to use by setting the
-`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`,
+`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
 which will try to choose an appropriate backend based on the details of the
 request. You may also choose a specific backend, along with
 some options. A full set of options is available in the `vllm serve --help`
 text.
 
-Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one:
+Now let's see an example for each of the cases, starting with `choice`, as it's the easiest one:
 
 ??? code
 
@@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
         messages=[
             {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
         ],
-        extra_body={"guided_choice": ["positive", "negative"]},
+        extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
     )
     print(completion.choices[0].message.content)
     ```
 
-The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:
+The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
 
 ??? code
 
@@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
                 "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
             }
         ],
-        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
+        extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
     )
     print(completion.choices[0].message.content)
     ```
 
 One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
-For this we can use the `guided_json` parameter in two different ways:
+For this we can use the `json` parameter in two different ways:
 
 - Using directly a [JSON Schema](https://json-schema.org/)
 - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
 
-The next example shows how to use the `guided_json` parameter with a Pydantic model:
+The next example shows how to use the `response_format` parameter with a Pydantic model:
 
 ??? code
 
@@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
     JSON schema and how the fields should be populated. This can improve the
     results notably in most cases.
 
-Finally we have the `guided_grammar` option, which is probably the most
+Finally we have the `grammar` option, which is probably the most
 difficult to use, but it´s really powerful. It allows us to define complete
 languages like SQL queries. It works by using a context free EBNF grammar.
 As an example, we can use to define a specific format of simplified SQL queries:
@@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
                 "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
             }
         ],
-        extra_body={"guided_grammar": simplified_sql_grammar},
+        extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
     )
     print(completion.choices[0].message.content)
     ```
@@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
 ## Offline Inference
 
 Offline inference allows for the same types of structured outputs.
-To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`.
-The main available options inside `GuidedDecodingParams` are:
+To use it, we'll need to configure structured outputs using the `StructuredOutputsParams` class inside `SamplingParams`.
+The main available options inside `StructuredOutputsParams` are:
 
 - `json`
 - `regex`
@@ -309,12 +309,12 @@ shown below:
 
     ```python
     from vllm import LLM, SamplingParams
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
 
     llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
 
-    guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
-    sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
+    structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
+    sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
     outputs = llm.generate(
         prompts="Classify this sentiment: vLLM is wonderful!",
         sampling_params=sampling_params,
diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md
index a8c0db0a7ac13..2a48596571d1d 100644
--- a/docs/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -71,7 +71,7 @@ This example demonstrates:
 * Making a request with `tool_choice="auto"`
 * Handling the structured response and executing the corresponding function
 
-You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
+You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
 
 Remember that it's the caller's responsibility to:
 
@@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
 
 ## Named Function Calling
 
-vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is
-enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
+vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
 high-quality one.
 
-vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
-For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend.
+vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
+For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
 
 To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
 specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
 
 ## Required Function Calling
 
-vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
+vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends is on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
 
 When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
 
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index 181a874efa3cb..bc52d02a50bd2 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -133,7 +133,7 @@ completion = client.chat.completions.create(
         {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
     ],
     extra_body={
-        "guided_choice": ["positive", "negative"]
+        "structured_outputs": {"choice": ["positive", "negative"]}
     }
 )
 ```
@@ -374,7 +374,7 @@ The following extra parameters are supported:
     ```python
     --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
     ```
-  
+
 [](){ #translations-api }
 
 ### Translations API
diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py
index 88d87beb4874d..6b6099f71b120 100644
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-This file demonstrates the example usage of guided decoding
-to generate structured outputs using vLLM. It shows how to apply
-different guided decoding techniques such as Choice, Regex, JSON schema,
-and Grammar to produce structured and formatted results
-based on specific prompts.
+This file demonstrates the example usage of structured outputs
+in vLLM. It shows how to apply different constraints such as choice,
+regex, JSON schema, and grammar to produce structured and formatted
+results based on specific prompts.
 """
 
 from enum import Enum
@@ -13,19 +12,23 @@ from enum import Enum
 from pydantic import BaseModel
 
 from vllm import LLM, SamplingParams
-from vllm.sampling_params import GuidedDecodingParams
+from vllm.sampling_params import StructuredOutputsParams
 
 MAX_TOKENS = 50
 
-# Guided decoding by Choice (list of possible options)
-guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
-sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
+# Structured outputs by Choice (list of possible options)
+structured_outputs_params_choice = StructuredOutputsParams(
+    choice=["Positive", "Negative"]
+)
+sampling_params_choice = SamplingParams(
+    structured_outputs=structured_outputs_params_choice
+)
 prompt_choice = "Classify this sentiment: vLLM is wonderful!"
 
-# Guided decoding by Regex
-guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
+# Structured outputs by Regex
+structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
 sampling_params_regex = SamplingParams(
-    guided_decoding=guided_decoding_params_regex,
+    structured_outputs=structured_outputs_params_regex,
     stop=["\n"],
     max_tokens=MAX_TOKENS,
 )
@@ -36,7 +39,7 @@ prompt_regex = (
 )
 
 
-# Guided decoding by JSON using Pydantic schema
+# Structured outputs by JSON using Pydantic schema
 class CarType(str, Enum):
     sedan = "sedan"
     suv = "SUV"
@@ -51,17 +54,16 @@ class CarDescription(BaseModel):
 
 
 json_schema = CarDescription.model_json_schema()
-guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
+structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
 sampling_params_json = SamplingParams(
-    guided_decoding=guided_decoding_params_json,
-    max_tokens=MAX_TOKENS,
+    structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
 )
 prompt_json = (
-    "Generate a JSON with the brand, model and car_type of"
+    "Generate a JSON with the brand, model and car_type of "
     "the most iconic car from the 90's"
 )
 
-# Guided decoding by Grammar
+# Structured outputs by Grammar
 simplified_sql_grammar = """
 root ::= select_statement
 select_statement ::= "SELECT " column " from " table " where " condition
@@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 "
 condition ::= column "= " number
 number ::= "1 " | "2 "
 """
-guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
+structured_outputs_params_grammar = StructuredOutputsParams(
+    grammar=simplified_sql_grammar
+)
 sampling_params_grammar = SamplingParams(
-    guided_decoding=guided_decoding_params_grammar,
+    structured_outputs=structured_outputs_params_grammar,
     max_tokens=MAX_TOKENS,
 )
 prompt_grammar = (
-    "Generate an SQL query to show the 'username' and 'email'from the 'users' table."
+    "Generate an SQL query to show the 'username' and 'email' from the 'users' table."
 )
 
 
@@ -93,16 +97,16 @@ def main():
     llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
 
     choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
-    format_output("Guided decoding by Choice", choice_output)
+    format_output("Structured outputs by Choice", choice_output)
 
     regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
-    format_output("Guided decoding by Regex", regex_output)
+    format_output("Structured outputs by Regex", regex_output)
 
     json_output = generate_output(prompt_json, sampling_params_json, llm)
-    format_output("Guided decoding by JSON", json_output)
+    format_output("Structured outputs by JSON", json_output)
 
     grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
-    format_output("Guided decoding by Grammar", grammar_output)
+    format_output("Structured outputs by Grammar", grammar_output)
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 7eb8668213eef..6ff65b18f6674 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -6,7 +6,7 @@ without any specific flags:
 
 ```bash
 VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
-    --guided-decoding-backend outlines
+    --structured-outputs-config.backend outlines
 ```
 
 This example demonstrates how to generate chat completions
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py
index 2a8f4637260c2..3ea6c73e90e8f 100644
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
                 "content": "Classify this sentiment: vLLM is wonderful!",
             }
         ],
-        "extra_body": {"guided_choice": ["positive", "negative"]},
+        "extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
     },
     "regex": {
         "messages": [
@@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
             }
         ],
         "extra_body": {
-            "guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n",
+            "structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
         },
     },
     "json": {
@@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
             }
         ],
         "extra_body": {
-            "guided_grammar": """
+            "structured_outputs": {
+                "grammar": """
 root ::= select_statement
 
 select_statement ::= "SELECT " column " from " table " where " condition
@@ -135,6 +136,7 @@ condition ::= column "= " number
 
 number ::= "1 " | "2 "
 """,
+            }
         },
     },
     "structural_tag": {
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index c23eeee271869..da75806ccf4de 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -184,7 +184,7 @@ def sample_enum_json_schema():
 
 
 @pytest.fixture
-def sample_guided_choice():
+def sample_structured_outputs_choices():
     return [
         "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
         "Ruby", "Swift", "Kotlin"
diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py
deleted file mode 100644
index ac0b7e134c55a..0000000000000
--- a/tests/entrypoints/llm/test_lazy_outlines.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import sys
-from contextlib import nullcontext
-
-from vllm_test_utils import BlameResult, blame
-
-from vllm import LLM, SamplingParams
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.sampling_params import GuidedDecodingParams
-
-
-def run_normal():
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-    # Create an LLM without guided decoding as a baseline.
-    llm = LLM(model="distilbert/distilgpt2",
-              enforce_eager=True,
-              gpu_memory_utilization=0.3)
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-    # Destroy the LLM object and free up the GPU memory.
-    del llm
-    cleanup_dist_env_and_memory()
-
-
-def run_xgrammar(sample_regex):
-    # Create an LLM with guided decoding enabled.
-    llm = LLM(model="distilbert/distilgpt2",
-              enforce_eager=True,
-              guided_decoding_backend="xgrammar",
-              gpu_memory_utilization=0.3)
-    prompt = f"Give an example IPv4 address with this regex: {sample_regex}"
-    guided_decoding = GuidedDecodingParams(regex=sample_regex)
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     guided_decoding=guided_decoding)
-    outputs = llm.generate(
-        prompts=[prompt] * 2,
-        sampling_params=sampling_params,
-        use_tqdm=True,
-    )
-
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def test_lazy_outlines(sample_regex):
-    """If users don't use guided decoding, outlines should not be imported.
-    """
-    # make sure outlines is not imported
-    module_name = "outlines"
-    # In CI, we only check finally if the module is imported.
-    # If it is indeed imported, we can rerun the test with `use_blame=True`,
-    # which will trace every function call to find the first import location,
-    # and help find the root cause.
-    # We don't run it in CI by default because it is slow.
-    use_blame = False
-    context = blame(
-        lambda: module_name in sys.modules) if use_blame else nullcontext()
-    with context as result:
-        run_normal()
-        run_xgrammar(sample_regex)
-    if use_blame:
-        assert isinstance(result, BlameResult)
-        print(f"the first import location is:\n{result.trace_stack}")
-    assert module_name not in sys.modules, (
-        f"Module {module_name} is imported. To see the first"
-        f" import location, run the test with `use_blame=True`.")
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index d5924b7b3ae34..a827f94cfbfe5 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# imports for guided decoding tests
+# imports for structured outputs tests
 import json
 from typing import Optional
 
@@ -480,10 +480,11 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  sample_guided_choice, is_v1_server: bool):
+async def test_structured_outputs_choice_chat(
+        client: openai.AsyncOpenAI, sample_structured_outputs_choices,
+        is_v1_server: bool):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("Structured outputs is only supported in v1 engine")
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -498,9 +499,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice))
+        extra_body=dict(
+            structured_outputs={"choice": sample_structured_outputs_choices}))
     choice1 = chat_completion.choices[0].message.content
-    assert choice1 in sample_guided_choice
+    assert choice1 in sample_structured_outputs_choices
 
     messages.append({"role": "assistant", "content": choice1})
     messages.append({
@@ -512,17 +514,19 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice))
+        extra_body=dict(
+            structured_outputs={"choice": sample_structured_outputs_choices}))
     choice2 = chat_completion.choices[0].message.content
-    assert choice2 in sample_guided_choice
+    assert choice2 in sample_structured_outputs_choices
     assert choice1 != choice2
 
 
 @pytest.mark.asyncio
-async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
-                                is_v1_server: bool):
+async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
+                                            sample_json_schema,
+                                            is_v1_server: bool):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("Structured outputs is only supported in v1 engine")
 
     messages = [{
         "role": "system",
@@ -538,7 +542,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema))
+        extra_body=dict(structured_outputs={"json": sample_json_schema}))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -555,7 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema))
+        extra_body=dict(structured_outputs={"json": sample_json_schema}))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -565,10 +569,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
 
 
 @pytest.mark.asyncio
-async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
-                                 is_v1_server: bool):
+async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
+                                             sample_regex, is_v1_server: bool):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("Structured outputs is only supported in v1 engine")
 
     messages = [{
         "role": "system",
@@ -583,7 +587,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex))
+        extra_body=dict(structured_outputs={"regex": sample_regex}))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -594,7 +598,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex))
+        extra_body=dict(structured_outputs={"regex": sample_regex}))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -602,7 +606,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
 
 
 @pytest.mark.asyncio
-async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
+async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -614,17 +618,19 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
     }]
 
     with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(model=MODEL_NAME,
-                                                 messages=messages,
-                                                 extra_body=dict(guided_regex={
-                                                     1: "Python",
-                                                     2: "C++"
-                                                 }))
+        _ = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            extra_body=dict(
+                structured_outputs={"regex": {
+                    1: "Python",
+                    2: "C++"
+                }}))
 
 
 @pytest.mark.asyncio
-async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           sample_guided_choice):
+async def test_structured_outputs_choice_chat_logprobs(
+        client: openai.AsyncOpenAI, sample_structured_outputs_choices):
 
     messages = [{
         "role": "system",
@@ -641,7 +647,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice))
+        extra_body=dict(
+            structured_outputs={"choice": sample_structured_outputs_choices}))
 
     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -663,10 +670,23 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
     }, {
         "role":
         "user",
-        "content":
-        f"Give an example JSON for an employee profile that "
-        f"fits this schema: {sample_json_schema}"
+        "content": ("Give an example JSON for an employee "
+                    "profile using the specified tool.")
     }]
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "dummy_function_name",
+            "description": "This is a dummy function",
+            "parameters": sample_json_schema
+        }
+    }]
+    tool_choice = {
+        "type": "function",
+        "function": {
+            "name": "dummy_function_name"
+        }
+    }
 
     # non-streaming
 
@@ -674,20 +694,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        tools=[{
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name",
-                "description": "This is a dummy function",
-                "parameters": sample_json_schema
-            }
-        }],
-        tool_choice={
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name"
-            }
-        },
+        tools=tools,
+        tool_choice=tool_choice,
     )
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
@@ -705,25 +713,12 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
 
     # streaming
 
-    stream = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_completion_tokens=1000,
-        tools=[{
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name",
-                "description": "This is a dummy function",
-                "parameters": sample_json_schema
-            }
-        }],
-        tool_choice={
-            "type": "function",
-            "function": {
-                "name": "dummy_function_name"
-            }
-        },
-        stream=True)
+    stream = await client.chat.completions.create(model=MODEL_NAME,
+                                                  messages=messages,
+                                                  max_completion_tokens=1000,
+                                                  tools=tools,
+                                                  tool_choice=tool_choice,
+                                                  stream=True)
 
     output = []
     finish_reason_count = 0
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 3650b15792575..0347513befe32 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# imports for guided decoding tests
+# imports for structured outputs tests
 import json
 import os
 from typing import Optional
@@ -23,8 +23,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 # technically these adapters use a different base model,
 # but we're not testing generation quality here
 
-GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
-
 
 @pytest.fixture(scope="module")
 def default_server_args(zephyr_lora_files):
@@ -595,12 +593,13 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_completion(client: openai.AsyncOpenAI,
-                                      guided_decoding_backend: str,
-                                      sample_json_schema, is_v1_server: bool):
+async def test_structured_outputs_json_completion(
+    client: openai.AsyncOpenAI,
+    sample_json_schema,
+    is_v1_server: bool,
+):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("structured outputs is only supported in v1 engine")
 
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -609,8 +608,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
         n=3,
         temperature=1.0,
         max_tokens=500,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(structured_outputs=dict(json=sample_json_schema)))
 
     assert completion.id is not None
     assert len(completion.choices) == 3
@@ -620,12 +618,13 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_completion(client: openai.AsyncOpenAI,
-                                       guided_decoding_backend: str,
-                                       sample_regex, is_v1_server: bool):
+async def test_structured_outputs_regex_completion(
+    client: openai.AsyncOpenAI,
+    sample_regex,
+    is_v1_server: bool,
+):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("structured outputs is only supported in v1 engine")
 
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -633,8 +632,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
         n=3,
         temperature=1.0,
         max_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(structured_outputs=dict(regex=sample_regex)))
 
     assert completion.id is not None
     assert len(completion.choices) == 3
@@ -644,13 +642,13 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_choice_completion(client: openai.AsyncOpenAI,
-                                        guided_decoding_backend: str,
-                                        sample_guided_choice,
-                                        is_v1_server: bool):
+async def test_structured_outputs_choice_completion(
+    client: openai.AsyncOpenAI,
+    sample_structured_outputs_choices,
+    is_v1_server: bool,
+):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("structured outputs is only supported in v1 engine")
 
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -658,20 +656,21 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI,
         n=2,
         temperature=1.0,
         max_tokens=10,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(structured_outputs=dict(
+            choice=sample_structured_outputs_choices)))
 
     assert completion.id is not None
     assert len(completion.choices) == 2
     for i in range(2):
-        assert completion.choices[i].text in sample_guided_choice
+        assert completion.choices[i].text in sample_structured_outputs_choices
 
 
 @pytest.mark.asyncio
-async def test_guided_grammar(client: openai.AsyncOpenAI,
-                              sample_sql_statements, is_v1_server: bool):
+async def test_structured_outputs_grammar(client: openai.AsyncOpenAI,
+                                          sample_sql_statements,
+                                          is_v1_server: bool):
     if not is_v1_server:
-        pytest.skip("Guided grammar is only supported in v1 engine")
+        pytest.skip("grammar is only supported in v1 engine")
 
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -679,7 +678,8 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
                 "table_1 where it is equals to 1"),
         temperature=1.0,
         max_tokens=500,
-        extra_body=dict(guided_grammar=sample_sql_statements))
+        extra_body=dict(
+            structured_outputs=dict(grammar=sample_sql_statements), ))
 
     content = completion.choices[0].text
 
@@ -730,27 +730,26 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
-                                          guided_decoding_backend: str,
-                                          sample_json_schema, sample_regex,
-                                          is_v1_server: bool):
+async def test_structured_outputs_type_error(client: openai.AsyncOpenAI,
+                                             sample_json_schema, sample_regex,
+                                             is_v1_server: bool):
     if not is_v1_server:
-        pytest.skip("Guided decoding is only supported in v1 engine")
+        pytest.skip("structured outputs is only supported in v1 engine")
 
     with pytest.raises(openai.BadRequestError):
         _ = await client.completions.create(
             model=MODEL_NAME,
             prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(guided_json=42,
-                            guided_decoding_backend=guided_decoding_backend))
+            extra_body=dict(structured_outputs=dict(json=42)))
 
     with pytest.raises(openai.BadRequestError):
         _ = await client.completions.create(
             model=MODEL_NAME,
             prompt="Give an example string that fits this regex",
-            extra_body=dict(guided_regex=sample_regex,
-                            guided_json=sample_json_schema))
+            extra_body=dict(structured_outputs=dict(
+                regex=sample_regex,
+                json=sample_json_schema,
+            )))
 
 
 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index 4ef5d4e8a699a..3649cefa9bf42 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -142,7 +142,7 @@ def server():  # noqa: F811
         "--dtype",
         "half",
         "--enable-auto-tool-choice",
-        "--guided-decoding-backend",
+        "--structured-outputs-config.backend",
         "xgrammar",
         "--tool-call-parser",
         "hermes",
@@ -225,7 +225,7 @@ def k2_server():  # noqa: F811
         "--dtype",
         "half",
         "--enable-auto-tool-choice",
-        "--guided-decoding-backend",
+        "--structured-outputs-config.backend",
         "xgrammar",
         "--tool-call-parser",
         "hermes",
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index 11ed1c4a9ee4b..73f79ac28d110 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -102,12 +102,14 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
                                 if "custom" in tool_call:
                                     return False
 
-            # Sometimes guided_grammar is generated to be empty
+            # Sometimes structured_outputs.grammar is generated to be empty
             # Causing a server error in EBNF grammar parsing
             # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
-            guided_grammar = case.body.get("guided_grammar")
+            structured_outputs = case.body.get("structured_outputs", {})
+            grammar = structured_outputs.get("grammar") if isinstance(
+                structured_outputs, dict) else None
 
-            if guided_grammar == '':
+            if grammar == '':
                 # Allow None (will be handled as no grammar)
                 # But skip empty strings
                 return False
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index bfa3f983cd87e..bb4c633e5e502 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -3,7 +3,7 @@
 
 import io
 
-# imports for guided decoding tests
+# imports for structured outputs tests
 import openai
 import pybase64
 import pytest
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index de26fce854f5b..8e68699e5904a 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -333,7 +333,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
             "role": "user",
             "content": "what is 1+1?"
         }],
-        guided_decoding_backend="outlines",
     )
 
     with suppress(Exception):
@@ -378,7 +377,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
             "role": "user",
             "content": "what is 1+1?"
         }],
-        guided_decoding_backend="outlines",
     )
 
     with suppress(Exception):
@@ -433,7 +431,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
             "role": "user",
             "content": "what is 1+1?"
         }],
-        guided_decoding_backend="outlines",
     )
 
     with suppress(Exception):
@@ -489,7 +486,6 @@ async def test_serving_chat_could_load_correct_generation_config():
             "role": "user",
             "content": "what is 1+1?"
         }],
-        guided_decoding_backend="outlines",
     )
 
     with suppress(Exception):
diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index 6a3cdfdfc8081..23c99da97ad3a 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-# imports for guided decoding tests
+# imports for structured outputs tests
 import io
 import json
 
diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py
index f43b7a253d28d..eb7879927b9b6 100644
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import io
-# imports for guided decoding tests
+# imports for structured outputs tests
 import json
 
 import httpx
diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py
deleted file mode 100644
index 7330f61e67689..0000000000000
--- a/tests/test_sampling_params.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for the SamplingParams class.
-"""
-
-import pytest
-
-from vllm import SamplingParams
-from vllm.config import ModelConfig
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest
-
-MODEL_NAME = "Qwen/Qwen1.5-7B"
-
-
-def test_max_tokens_none():
-    """max_tokens=None should be allowed"""
-    SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
-
-
-@pytest.fixture(scope="module")
-def model_config():
-    return ModelConfig(
-        MODEL_NAME,
-        seed=0,
-        dtype="float16",
-    )
-
-
-@pytest.fixture(scope="module")
-def default_max_tokens():
-    return 4096
-
-
-def test_sampling_params_from_request_with_no_guided_decoding_backend(
-        model_config, default_max_tokens):
-    # guided_decoding_backend is not present at request level
-    request = ChatCompletionRequest.model_validate({
-        'messages': [{
-            'role': 'user',
-            'content': 'Hello'
-        }],
-        'model':
-        MODEL_NAME,
-        'response_format': {
-            'type': 'json_object',
-        },
-    })
-
-    sampling_params = request.to_sampling_params(
-        default_max_tokens,
-        model_config.logits_processor_pattern,
-    )
-    # we do not expect any backend to be present and the default
-    # guided_decoding_backend at engine level will be used.
-    assert sampling_params.guided_decoding.backend is None
-
-
-@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
-                         [("xgrammar", "xgrammar"), ("guidance", "guidance"),
-                          ("outlines", "outlines")])
-def test_sampling_params_from_request_with_guided_decoding_backend(
-        request_level_guided_decoding_backend: str, expected: str,
-        model_config, default_max_tokens):
-
-    request = ChatCompletionRequest.model_validate({
-        'messages': [{
-            'role': 'user',
-            'content': 'Hello'
-        }],
-        'model':
-        MODEL_NAME,
-        'response_format': {
-            'type': 'json_object',
-        },
-        'guided_decoding_backend':
-        request_level_guided_decoding_backend,
-    })
-
-    sampling_params = request.to_sampling_params(
-        default_max_tokens,
-        model_config.logits_processor_pattern,
-    )
-    # backend correctly identified in resulting sampling_params
-    assert sampling_params.guided_decoding.backend == expected
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index e0ed221a93e12..130e9547bdccb 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -68,7 +68,7 @@ EXAMPLE_TOOLS = [
 def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
                        should_match: bool):
     self = MagicMock(tool_choice="required", tools=tools)
-    schema = ChatCompletionRequest._get_guided_json_from_tool(self)
+    schema = ChatCompletionRequest._get_json_schema_from_tool(self)
     assert isinstance(schema, dict)
 
     # use build_regex_from_schema used in JSONLogitsProcessor to create Guide
@@ -218,7 +218,7 @@ VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
                 }
             }, {}], False),
     ])
-def test_guided_json(sample_output, should_match):
+def test_structured_outputs_json(sample_output, should_match):
     _compile_and_check(tools=TypeAdapter(
         list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS),
                        sample_output=sample_output,
@@ -273,8 +273,9 @@ def update_parameters_empty_dict(
 @pytest.mark.parametrize(
     "update_parameters",
     [update_parameters_none, update_parameters_empty_dict])
-def test_guided_json_without_parameters(sample_output, should_match,
-                                        update_parameters):
+def test_structured_outputs_json_without_parameters(sample_output,
+                                                    should_match,
+                                                    update_parameters):
     updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
     tools = TypeAdapter(
         list[ChatCompletionToolsParam]).validate_python(updated_tools)
@@ -334,4 +335,4 @@ def test_streaming_output_valid(output, empty_params, delta_len):
             combined_messages += message.tool_calls[0].function.arguments
     combined_messages += "}]"
     assert json.loads(combined_messages) == output
-    assert json.dumps(json.loads(combined_messages)) == output_json
\ No newline at end of file
+    assert json.dumps(json.loads(combined_messages)) == output_json
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 572d6c9c889f6..f6fc1e6d37d14 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
                          SchedulerConfig, SpeculativeConfig, VllmConfig)
 from vllm.multimodal.inputs import (MultiModalFeatureSpec,
                                     MultiModalKwargsItem, PlaceholderRange)
-from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -1796,11 +1796,11 @@ def test_schedule_skip_tokenizer_init():
 
 def test_schedule_skip_tokenizer_init_structured_output_request():
     scheduler = create_scheduler(skip_tokenizer_init=True)
-    guided_params = GuidedDecodingParams(regex="[0-9]+")
+    structured_outputs_params = StructuredOutputsParams(regex="[0-9]+")
     sampling_params = SamplingParams(
         ignore_eos=False,
         max_tokens=16,
-        guided_decoding=guided_params,
+        structured_outputs=structured_outputs_params,
     )
     request = Request(
         request_id="0",
diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index 2848420c22085..7529c3780ec25 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Optional
 import pytest
 
 from vllm import LLM
-from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
 
 if TYPE_CHECKING:
@@ -97,7 +97,7 @@ def _get_test_sampling_params(
             top_p=0.95,
             n=n,
             seed=seed,
-            guided_decoding=GuidedDecodingParams(
+            structured_outputs=StructuredOutputsParams(
                 regex="[0-9]+") if structured_outputs else None,
         ) for n in n_list
     ], n_list
diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py
index ffe0612124660..46b953fe37433 100644
--- a/tests/v1/entrypoints/conftest.py
+++ b/tests/v1/entrypoints/conftest.py
@@ -151,7 +151,7 @@ def sample_definition_json_schema():
 
 
 @pytest.fixture
-def sample_guided_choice():
+def sample_structured_outputs_choices():
     return [
         "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
         "Ruby", "Swift", "Kotlin"
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index ad62914195b44..4b0f3b2d9967e 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -15,12 +15,13 @@ import torch
 from pydantic import BaseModel
 
 from tests.reasoning.utils import run_reasoning_extraction
+from vllm.config import StructuredOutputsConfig
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
-from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 
 if TYPE_CHECKING:
     from vllm.config import TokenizerMode
@@ -90,7 +91,7 @@ def _load_json(s: str, backend: str) -> str:
 
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize(
-    "model_name, guided_decoding_backend, tokenizer_mode, speculative_config",
+    "model_name, backend, tokenizer_mode, speculative_config",
     PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
 def test_structured_output(
     monkeypatch: pytest.MonkeyPatch,
@@ -99,8 +100,8 @@ def test_structured_output(
     sample_sql_ebnf: str,
     sample_sql_lark: str,
     sample_regex: str,
-    sample_guided_choice: str,
-    guided_decoding_backend: str,
+    sample_structured_outputs_choices: str,
+    backend: str,
     tokenizer_mode: str,
     model_name: str,
     speculative_config: dict[str, Any],
@@ -115,16 +116,15 @@ def test_structured_output(
     enforce_eager = bool(not current_platform.is_tpu())
     # Use a single LLM instance for several scenarios to
     # speed up the test suite.
-    llm = LLM(
-        model=model_name,
-        enforce_eager=enforce_eager,
-        max_model_len=1024,
-        guided_decoding_backend=guided_decoding_backend,
-        guided_decoding_disable_any_whitespace=(guided_decoding_backend
-                                                in {"xgrammar", "guidance"}),
-        seed=120,
-        tokenizer_mode=tokenizer_mode,
-        speculative_config=speculative_config)
+    llm = LLM(model=model_name,
+              enforce_eager=enforce_eager,
+              max_model_len=1024,
+              structured_outputs_config=dict(backend=backend,
+                                             disable_any_whitespace=backend
+                                             in {"xgrammar", "guidance"}),
+              seed=120,
+              tokenizer_mode=tokenizer_mode,
+              speculative_config=speculative_config)
 
     #
     # Test 1: Generate JSON output based on a provided schema
@@ -132,7 +132,7 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=4096,
-        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
+        structured_outputs=StructuredOutputsParams(json=sample_json_schema))
 
     prompt = ("Give an example JSON for an employee profile that fits this "
               "schema. Make the response as short as possible. Schema: "
@@ -152,7 +152,7 @@ def test_structured_output(
 
         generated_text = output.outputs[0].text
         assert generated_text is not None
-        if guided_decoding_backend != 'lm-format-enforcer':
+        if backend != 'lm-format-enforcer':
             assert "\n" not in generated_text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
         output_json = json.loads(generated_text)
@@ -161,12 +161,12 @@ def test_structured_output(
     #
     # Test 2: Generate JSON object without a schema
     #
-    if guided_decoding_backend != "outlines":
+    if backend != "outlines":
         sampling_params = SamplingParams(
             temperature=1.0,
             max_tokens=4096,
             n=2,
-            guided_decoding=GuidedDecodingParams(json_object=True))
+            structured_outputs=StructuredOutputsParams(json_object=True))
 
         outputs = llm.generate(prompts=(
             "Generate a JSON object with curly braces for a person with "
@@ -195,8 +195,9 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=4096,
-        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
-    if guided_decoding_backend.startswith("xgrammar"):
+        structured_outputs=StructuredOutputsParams(
+            json=unsupported_json_schema))
+    if backend.startswith("xgrammar"):
         with pytest.raises(ValueError,
                            match="The provided JSON schema contains features "
                            "not supported by xgrammar."):
@@ -230,7 +231,7 @@ def test_structured_output(
             parsed_json = json.loads(generated_text)
             assert isinstance(parsed_json, dict)
 
-    if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
+    if backend not in ["outlines", "lm-format-enforcer"]:
         #
         # Test 4: Generate SQL statement using EBNF grammar
         #
@@ -238,7 +239,8 @@ def test_structured_output(
             temperature=0.8,
             top_p=0.95,
             max_tokens=1000,
-            guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf))
+            structured_outputs=StructuredOutputsParams(
+                grammar=sample_sql_ebnf))
         outputs = llm.generate(
             ("Generate a sql statement that selects col_1 from "
              "table_1 where it is equal to 1. Make the response as short as "
@@ -271,7 +273,8 @@ def test_structured_output(
             temperature=0.8,
             top_p=0.95,
             max_tokens=1000,
-            guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark))
+            structured_outputs=StructuredOutputsParams(
+                grammar=sample_sql_lark))
         outputs = llm.generate(
             ("Generate a sql statement that selects col_1 from "
              "table_1 where it is equal to 1. Make the response as short as "
@@ -309,7 +312,8 @@ def test_structured_output(
             temperature=0.8,
             top_p=0.95,
             max_tokens=1000,
-            guided_decoding=GuidedDecodingParams(grammar="not a grammar"))
+            structured_outputs=StructuredOutputsParams(
+                grammar="not a grammar"))
         with pytest.raises(ValueError, match="Failed to convert the grammar "):
             llm.generate(
                 ("Generate a sql statement that selects col_1 from "
@@ -325,7 +329,7 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=0.8,
         top_p=0.95,
-        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+        structured_outputs=StructuredOutputsParams(regex=sample_regex))
 
     prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. "
               f"Make the response as short as possible.")
@@ -352,7 +356,8 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=0.8,
         top_p=0.95,
-        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
+        structured_outputs=StructuredOutputsParams(
+            choice=sample_structured_outputs_choices))
 
     outputs = llm.generate(
         ("The best language for type-safe systems programming is "
@@ -368,7 +373,7 @@ def test_structured_output(
         generated_text = output.outputs[0].text
         print(generated_text)
         assert generated_text is not None
-        assert generated_text in sample_guided_choice
+        assert generated_text in sample_structured_outputs_choices
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
     #
@@ -378,7 +383,7 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=json_schema))
+        structured_outputs=StructuredOutputsParams(json=json_schema))
 
     outputs = llm.generate(
         ("Generate a JSON with the brand, model and car_type of the most "
@@ -422,7 +427,7 @@ def test_structured_output(
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=4096,
-        guided_decoding=GuidedDecodingParams(json=json_schema))
+        structured_outputs=StructuredOutputsParams(json=json_schema))
 
     outputs = llm.generate(
         ("Generate a description of a frog using 50 characters. "
@@ -444,7 +449,7 @@ def test_structured_output(
         output_json = json.loads(generated_text)
         jsonschema.validate(instance=output_json, schema=json_schema)
 
-    if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]:
+    if backend not in ["outlines", "lm-format-enforcer"]:
         #
         # Test 11: Generate structured output using structural_tag format
         #
@@ -470,7 +475,7 @@ def test_structured_output(
         sampling_params = SamplingParams(
             temperature=0.0,
             max_tokens=4096,
-            guided_decoding=GuidedDecodingParams(
+            structured_outputs=StructuredOutputsParams(
                 structural_tag=json.dumps(structural_tag_config)))
 
         prompt = """
@@ -547,7 +552,7 @@ Make the response as short as possible.
 
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize(
-    "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
     [
         ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto",
          "deepseek_r1", NGRAM_SPEC_CONFIG),
@@ -556,7 +561,7 @@ Make the response as short as possible.
 )
 def test_structured_output_with_reasoning_matrices(
     monkeypatch: pytest.MonkeyPatch,
-    guided_decoding_backend: str,
+    backend: str,
     tokenizer_mode: TokenizerMode,
     reasoning_parser: str,
     model_name: str,
@@ -576,10 +581,11 @@ def test_structured_output_with_reasoning_matrices(
         enforce_eager=bool(not current_platform.is_tpu()),
         max_model_len=1024,
         max_num_seqs=16,
-        guided_decoding_backend=guided_decoding_backend,
-        guided_decoding_disable_any_whitespace=True,
+        structured_outputs_config=dict(backend=backend,
+                                       disable_any_whitespace=backend
+                                       in {"xgrammar", "guidance"},
+                                       reasoning_parser=reasoning_parser),
         tokenizer_mode=tokenizer_mode,
-        reasoning_parser=reasoning_parser,
         speculative_config=speculative_config,
     )
     tokenizer = llm.get_tokenizer()
@@ -603,7 +609,7 @@ def test_structured_output_with_reasoning_matrices(
     sampling_params = SamplingParams(
         temperature=0.1,
         max_tokens=8192,
-        guided_decoding=GuidedDecodingParams(json=reasoning_schema),
+        structured_outputs=StructuredOutputsParams(json=reasoning_schema),
     )
     outputs = llm.generate(
         [reasoning_prompt],
@@ -640,13 +646,14 @@ def test_structured_output_auto_mode(
 
     llm = LLM(model=model_name,
               max_model_len=1024,
-              guided_decoding_backend="auto",
+              structured_outputs_config=dict(backend="auto"),
               tokenizer_mode=tokenizer_mode)
 
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
+        structured_outputs=StructuredOutputsParams(
+            json=unsupported_json_schema))
 
     prompts = (
         "Give an example JSON object for a grade "
@@ -681,9 +688,10 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
 
     llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
               max_model_len=1024,
-              guided_decoding_backend="guidance",
-              guided_decoding_disable_any_whitespace=True,
-              guided_decoding_disable_additional_properties=True)
+              structured_outputs_config=dict(
+                  backend="guidance",
+                  disable_any_whitespace=True,
+                  disable_additional_properties=True))
 
     schema = {
         'type': 'object',
@@ -709,14 +717,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
         "<|im_end|>\n<|im_start|>assistant\n")
 
     def generate_with_backend(backend):
-        guided_params = GuidedDecodingParams(
+        structured_outputs_params = StructuredOutputsParams(
             json=schema,
             backend=backend,
             disable_any_whitespace=True,
             disable_additional_properties=True)
-        sampling_params = SamplingParams(temperature=0,
-                                         max_tokens=256,
-                                         guided_decoding=guided_params)
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=256,
+            structured_outputs=structured_outputs_params)
 
         outputs = llm.generate(prompt, sampling_params=sampling_params)
         assert outputs is not None
@@ -736,12 +745,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
     assert "a6" not in generated
 
 
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["guidance", "xgrammar", "outlines"])
-def test_structured_output_batched_with_non_guided_requests(
+@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
+def test_structured_output_batched_with_non_structured_outputs_requests(
     monkeypatch: pytest.MonkeyPatch,
     sample_json_schema: dict[str, Any],
-    guided_decoding_backend: str,
+    backend: str,
 ):
     monkeypatch.setenv("VLLM_USE_V1", "1")
 
@@ -753,24 +761,25 @@ def test_structured_output_batched_with_non_guided_requests(
         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
         enforce_eager=enforce_eager,
         max_model_len=1024,
-        guided_decoding_backend=guided_decoding_backend,
-        guided_decoding_disable_any_whitespace=(guided_decoding_backend
-                                                in {"xgrammar", "guidance"}),
+        structured_outputs_config=StructuredOutputsConfig(
+            backend=backend,
+            disable_any_whitespace=backend in {"xgrammar", "guidance"},
+        ),
     )
 
-    guided_prompt = (
+    structured_outputs_prompt = (
         "Give an example JSON for an employee profile that fits this "
         "schema. Make the response as short as possible. Schema: "
         f"{sample_json_schema}")
 
-    non_guided_prompt = "The diameter of the Earth in kilometers is "
+    non_structured_outputs_prompt = "The diameter of the Earth in kilometers is "
 
-    prompts = [guided_prompt, non_guided_prompt]
+    prompts = [structured_outputs_prompt, non_structured_outputs_prompt]
     sampling_params = [
-        SamplingParams(
-            temperature=1.0,
-            max_tokens=400,
-            guided_decoding=GuidedDecodingParams(json=sample_json_schema)),
+        SamplingParams(temperature=1.0,
+                       max_tokens=400,
+                       structured_outputs=StructuredOutputsParams(
+                           json=sample_json_schema)),
         # No max tokens, temp=0 to assert on contents
         SamplingParams(
             seed=42,
@@ -801,16 +810,16 @@ def test_structured_output_batched_with_non_guided_requests(
         print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")
 
         if index == 0:
-            # First prompt is guided, expect valid JSON
+            # First prompt uses structured outputs, expect valid JSON
             assert "\n" not in generated_text
             output_json = json.loads(generated_text)
             jsonschema.validate(instance=output_json,
                                 schema=sample_json_schema)
         else:
-            # Second prompt is not guided, expect valid output
+            # Second prompt has no structured outputs, expect valid output
             # Cannot assert on exact output, but we can expect it to be factual
             assert "12,742" in generated_text
 
-            # non-guided requests should not return a valid JSON here
+            # Requests without structured outputs should not return valid JSON
             with pytest.raises(ValueError):
                 output_json = json.loads(generated_text)
diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py
index dffb32846c05e..9aa285aa9b18d 100644
--- a/tests/v1/entrypoints/openai/test_chat_completion.py
+++ b/tests/v1/entrypoints/openai/test_chat_completion.py
@@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
                 "role": "user",
                 "content": prompt,
             }],
-            extra_body={"guided_json": invalid_json_schema},
+            extra_body={"structured_outputs": {
+                "json": invalid_json_schema
+            }},
         )
 
 
@@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
                 "content": prompt,
             }],
             extra_body={
-                "guided_regex": r"[.*",
+                "structured_outputs": {
+                    "regex": r"[.*"
+                },
                 "stop": ["\n"]
             },
         )
@@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
                 "role": "user",
                 "content": prompt,
             }],
-            extra_body={"guided_grammar": invalid_simplified_sql_grammar},
+            extra_body={
+                "structured_outputs": {
+                    "grammar": invalid_simplified_sql_grammar
+                }
+            },
         )
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 3114d7639f045..9090beb4bbd2a 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI,
         await client.completions.create(
             model=model_name,
             prompt=prompt,
-            extra_body={"guided_json": invalid_json_schema},
+            extra_body={"structured_outputs": {
+                "json": invalid_json_schema
+            }},
         )
 
 
@@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str):
             model=model_name,
             prompt=prompt,
             extra_body={
-                "guided_regex": r"[.*",
+                "structured_outputs": {
+                    "regex": r"[.*"
+                },
                 "stop": ["\n"]
             },
         )
@@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
         await client.completions.create(
             model=model_name,
             prompt=prompt,
-            extra_body={"guided_grammar": invalid_simplified_sql_grammar},
+            extra_body={
+                "structured_outputs": {
+                    "grammar": invalid_simplified_sql_grammar
+                }
+            },
         )
 
 
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 631618d427d42..9a1c5f0b0d453 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -2277,34 +2277,34 @@ def get_served_model_name(model: str,
     return served_model_name
 
 
-GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines",
-                                "lm-format-enforcer"]
+StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines",
+                                   "lm-format-enforcer"]
 
 
 @config
 @dataclass
-class DecodingConfig:
-    """Dataclass which contains the decoding strategy of the engine."""
+class StructuredOutputsConfig:
+    """Dataclass which contains structured outputs config for the engine."""
 
-    backend: GuidedDecodingBackend = "auto"
-    """Which engine will be used for guided decoding (JSON schema / regex etc)
-    by default. With "auto", we will make opinionated choices based on request
-    contents and what the backend libraries currently support, so the behavior
-    is subject to change in each release."""
+    backend: StructuredOutputsBackend = "auto"
+    """Which engine will be used for structured outputs (e.g. JSON schema,
+    regex, etc) by default. With "auto", we will make opinionated choices
+    based on request contents and what the backend libraries currently support,
+    so the behavior is subject to change in each release."""
 
     disable_fallback: bool = False
     """If `True`, vLLM will not fallback to a different backend on error."""
 
     disable_any_whitespace: bool = False
-    """If `True`, the model will not generate any whitespace during guided
-    decoding. This is only supported for xgrammar and guidance backends."""
+    """If `True`, the model will not generate any whitespace during structured
+    outputs. This is only supported for xgrammar and guidance backends."""
 
     disable_additional_properties: bool = False
     """If `True`, the `guidance` backend will not use `additionalProperties`
     in the JSON schema. This is only supported for the `guidance` backend and
     is used to better align its behaviour with `outlines` and `xgrammar`."""
 
-    reasoning_backend: str = ""
+    reasoning_parser: str = ""
     """Select the reasoning parser depending on the model that you're using.
     This is used to parse the reasoning content into OpenAI API format."""
 
@@ -2451,8 +2451,9 @@ class VllmConfig:
     """LoRA configuration."""
     speculative_config: Optional[SpeculativeConfig] = None
     """Speculative decoding configuration."""
-    decoding_config: DecodingConfig = field(default_factory=DecodingConfig)
-    """Decoding configuration."""
+    structured_outputs_config: StructuredOutputsConfig = field(
+        default_factory=StructuredOutputsConfig)
+    """Structured outputs configuration."""
     observability_config: Optional[ObservabilityConfig] = None
     """Observability configuration."""
     quant_config: Optional[QuantizationConfig] = None
@@ -2543,8 +2544,8 @@ class VllmConfig:
             vllm_factors.append(self.speculative_config.compute_hash())
         else:
             vllm_factors.append("None")
-        if self.decoding_config:
-            vllm_factors.append(self.decoding_config.compute_hash())
+        if self.structured_outputs_config:
+            vllm_factors.append(self.structured_outputs_config.compute_hash())
         else:
             vllm_factors.append("None")
         if self.observability_config:
@@ -3063,7 +3064,7 @@ class VllmConfig:
             f"enforce_eager={self.model_config.enforce_eager}, "
             f"kv_cache_dtype={self.cache_config.cache_dtype}, "
             f"device_config={self.device_config.device}, "
-            f"decoding_config={self.decoding_config!r}, "
+            f"structured_outputs_config={self.structured_outputs_config!r}, "
             f"observability_config={self.observability_config!r}, "
             f"seed={self.model_config.seed}, "
             f"served_model_name={self.model_config.served_model_name}, "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index e2a1ec68e6f53..fb5beab77b270 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -22,17 +22,16 @@ from typing_extensions import TypeIs, deprecated
 
 import vllm.envs as envs
 from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
-                         ConfigType, ConvertOption, DecodingConfig,
-                         DetailedTraceModules, Device, DeviceConfig,
-                         DistributedExecutorBackend, EPLBConfig,
-                         GuidedDecodingBackend, HfOverrides, KVEventsConfig,
+                         ConfigType, ConvertOption, DetailedTraceModules,
+                         Device, DeviceConfig, DistributedExecutorBackend,
+                         EPLBConfig, HfOverrides, KVEventsConfig,
                          KVTransferConfig, LoadConfig, LogprobsMode,
                          LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
                          ModelDType, ModelImpl, ObservabilityConfig,
                          ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
                          RunnerOption, SchedulerConfig, SchedulerPolicy,
-                         SpeculativeConfig, TaskOption, TokenizerMode,
-                         VllmConfig, get_attr_docs)
+                         SpeculativeConfig, StructuredOutputsConfig,
+                         TaskOption, TokenizerMode, VllmConfig, get_attr_docs)
 from vllm.config.multimodal import MMCacheType, MultiModalConfig
 from vllm.config.parallel import ExpertPlacementStrategy
 from vllm.config.utils import get_field
@@ -418,12 +417,15 @@ class EngineArgs:
     disable_hybrid_kv_cache_manager: bool = (
         SchedulerConfig.disable_hybrid_kv_cache_manager)
 
-    guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend
-    guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback
-    guided_decoding_disable_any_whitespace: bool = \
-        DecodingConfig.disable_any_whitespace
-    guided_decoding_disable_additional_properties: bool = \
-        DecodingConfig.disable_additional_properties
+    structured_outputs_config: StructuredOutputsConfig = get_field(
+        VllmConfig, "structured_outputs_config")
+    reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
+    # Deprecated guided decoding fields
+    guided_decoding_backend: Optional[str] = None
+    guided_decoding_disable_fallback: Optional[bool] = None
+    guided_decoding_disable_any_whitespace: Optional[bool] = None
+    guided_decoding_disable_additional_properties: Optional[bool] = None
+
     logits_processor_pattern: Optional[
         str] = ModelConfig.logits_processor_pattern
 
@@ -462,7 +464,6 @@ class EngineArgs:
 
     additional_config: dict[str, Any] = \
         get_field(VllmConfig, "additional_config")
-    reasoning_parser: str = DecodingConfig.reasoning_backend
 
     use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
     pt_load_map_location: str = LoadConfig.pt_load_map_location
@@ -618,28 +619,29 @@ class EngineArgs:
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])
 
-        # Guided decoding arguments
-        guided_decoding_kwargs = get_kwargs(DecodingConfig)
-        guided_decoding_group = parser.add_argument_group(
-            title="DecodingConfig",
-            description=DecodingConfig.__doc__,
+        # Structured outputs arguments
+        structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig)
+        structured_outputs_group = parser.add_argument_group(
+            title="StructuredOutputsConfig",
+            description=StructuredOutputsConfig.__doc__,
         )
-        guided_decoding_group.add_argument("--guided-decoding-backend",
-                                           **guided_decoding_kwargs["backend"])
-        guided_decoding_group.add_argument(
-            "--guided-decoding-disable-fallback",
-            **guided_decoding_kwargs["disable_fallback"])
-        guided_decoding_group.add_argument(
-            "--guided-decoding-disable-any-whitespace",
-            **guided_decoding_kwargs["disable_any_whitespace"])
-        guided_decoding_group.add_argument(
-            "--guided-decoding-disable-additional-properties",
-            **guided_decoding_kwargs["disable_additional_properties"])
-        guided_decoding_group.add_argument(
+        structured_outputs_group.add_argument(
             "--reasoning-parser",
             # This choice is a special case because it's not static
             choices=list(ReasoningParserManager.reasoning_parsers),
-            **guided_decoding_kwargs["reasoning_backend"])
+            **structured_outputs_kwargs["reasoning_parser"])
+        # Deprecated guided decoding arguments
+        for arg, type in [
+            ("--guided-decoding-backend", str),
+            ("--guided-decoding-disable-fallback", bool),
+            ("--guided-decoding-disable-any-whitespace", bool),
+            ("--guided-decoding-disable-additional-properties", bool),
+        ]:
+            structured_outputs_group.add_argument(
+                arg,
+                type=type,
+                help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."),
+                deprecated=True)
 
         # Parallel arguments
         parallel_kwargs = get_kwargs(ParallelConfig)
@@ -934,6 +936,8 @@ class EngineArgs:
                                 **vllm_kwargs["compilation_config"])
         vllm_group.add_argument("--additional-config",
                                 **vllm_kwargs["additional_config"])
+        vllm_group.add_argument('--structured-outputs-config',
+                                **vllm_kwargs["structured_outputs_config"])
 
         # Other arguments
         parser.add_argument('--disable-log-stats',
@@ -1421,14 +1425,25 @@ class EngineArgs:
 
         load_config = self.create_load_config()
 
-        decoding_config = DecodingConfig(
-            backend=self.guided_decoding_backend,
-            disable_fallback=self.guided_decoding_disable_fallback,
-            disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
-            disable_additional_properties=\
-                self.guided_decoding_disable_additional_properties,
-            reasoning_backend=self.reasoning_parser
-        )
+        # Pass reasoning_parser into StructuredOutputsConfig
+        if self.reasoning_parser:
+            self.structured_outputs_config.reasoning_parser = \
+                self.reasoning_parser
+
+        # Forward the deprecated CLI args to the StructuredOutputsConfig
+        so_config = self.structured_outputs_config
+        if self.guided_decoding_backend is not None:
+            so_config.guided_decoding_backend = \
+            self.guided_decoding_backend
+        if self.guided_decoding_disable_fallback is not None:
+            so_config.guided_decoding_disable_fallback = \
+            self.guided_decoding_disable_fallback
+        if self.guided_decoding_disable_any_whitespace is not None:
+            so_config.guided_decoding_disable_any_whitespace = \
+            self.guided_decoding_disable_any_whitespace
+        if self.guided_decoding_disable_additional_properties is not None:
+            so_config.guided_decoding_disable_additional_properties = \
+            self.guided_decoding_disable_additional_properties
 
         observability_config = ObservabilityConfig(
             show_hidden_metrics_for_version=(
@@ -1446,7 +1461,7 @@ class EngineArgs:
             lora_config=lora_config,
             speculative_config=speculative_config,
             load_config=load_config,
-            decoding_config=decoding_config,
+            structured_outputs_config=self.structured_outputs_config,
             observability_config=observability_config,
             compilation_config=self.compilation_config,
             kv_transfer_config=self.kv_transfer_config,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 1ae82c9f6f6f9..6793041abc502 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -10,9 +10,8 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
 from weakref import ReferenceType
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
+from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig, VllmConfig)
-from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
@@ -955,10 +954,6 @@ class AsyncLLMEngine(EngineClient):
         """Get the parallel configuration of the vLLM engine."""
         return self.engine.get_parallel_config()
 
-    async def get_decoding_config(self) -> DecodingConfig:
-        """Get the decoding configuration of the vLLM engine."""
-        return self.engine.get_decoding_config()
-
     async def get_scheduler_config(self) -> SchedulerConfig:
         """Get the scheduling configuration of the vLLM engine."""
         return self.engine.get_scheduler_config()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 34b5dcb587503..708f3bbeeff15 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -16,9 +16,8 @@ import torch
 from typing_extensions import TypeVar
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
+from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
-from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase, Stats
@@ -213,8 +212,7 @@ class LLMEngine:
         self.device_config = vllm_config.device_config
         self.speculative_config = vllm_config.speculative_config  # noqa
         self.load_config = vllm_config.load_config
-        self.decoding_config = vllm_config.decoding_config or DecodingConfig(  # noqa
-        )
+        self.structured_outputs_config = vllm_config.structured_outputs_config
         self.observability_config = vllm_config.observability_config or ObservabilityConfig(  # noqa
         )
 
@@ -364,10 +362,9 @@ class LLMEngine:
                 self.observability_config.otlp_traces_endpoint)
 
         # Initialize reasoning parser if reasoning backend is set.
-        if self.decoding_config.reasoning_backend and \
-                self.tokenizer:
+        if self.structured_outputs_config.reasoning_parser and self.tokenizer:
             reasoner_class = ReasoningParserManager.get_reasoning_parser(
-                self.decoding_config.reasoning_backend)
+                self.structured_outputs_config.reasoning_parser)
             self.reasoner: ReasoningParser = reasoner_class(
                 self.tokenizer.get_lora_tokenizer())
 
@@ -381,7 +378,8 @@ class LLMEngine:
                 self.seq_counter,
                 stop_checker=StopChecker(
                     self.scheduler_config.max_model_len,
-                    self.reasoner if self.decoding_config.reasoning_backend
+                    self.reasoner
+                    if self.structured_outputs_config.reasoning_parser
                     and self.tokenizer else None,
                 ),
             ))
@@ -772,10 +770,6 @@ class LLMEngine:
         """Gets the parallel configuration."""
         return self.parallel_config
 
-    def get_decoding_config(self) -> DecodingConfig:
-        """Gets the decoding configuration."""
-        return self.decoding_config
-
     def get_scheduler_config(self) -> SchedulerConfig:
         """Gets the scheduler configuration."""
         return self.scheduler_config
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 808d2d0ce3d28..c345f17e6614f 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union
 
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
-from vllm.config import DecodingConfig, ModelConfig, VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -248,11 +248,6 @@ class EngineClient(ABC):
         """Get the model configuration of the vLLM engine."""
         ...
 
-    @abstractmethod
-    async def get_decoding_config(self) -> DecodingConfig:
-        """Get the decoding configuration of the vLLM engine."""
-        ...
-
     @abstractmethod
     async def get_input_preprocessor(self) -> InputPreprocessor:
         """Get the input processor of the vLLM engine."""
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index f2264292fa660..63e9478612bb1 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -15,8 +15,8 @@ import vllm.envs as envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                               BeamSearchSequence,
                               create_sort_beams_key_function)
-from vllm.config import (CompilationConfig, ModelDType, TokenizerMode,
-                         is_init_field)
+from vllm.config import (CompilationConfig, ModelDType,
+                         StructuredOutputsConfig, TokenizerMode, is_init_field)
 from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides,
                                    PoolerConfig, RunnerOption)
 from vllm.engine.llm_engine import LLMEngine
@@ -192,6 +192,8 @@ class LLM:
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[dict[str, Any]] = None,
         override_pooler_config: Optional[PoolerConfig] = None,
+        structured_outputs_config: Optional[Union[dict[
+            str, Any], StructuredOutputsConfig]] = None,
         kv_cache_memory_bytes: Optional[int] = None,
         compilation_config: Optional[Union[int, dict[str, Any],
                                            CompilationConfig]] = None,
@@ -236,14 +238,30 @@ class LLM:
                 compilation_config_instance = CompilationConfig(
                     level=compilation_config)
             elif isinstance(compilation_config, dict):
-                predicate = lambda x: is_init_field(CompilationConfig, x[0])
                 compilation_config_instance = CompilationConfig(
-                    **dict(filter(predicate, compilation_config.items())))
+                    **{
+                        k: v
+                        for k, v in compilation_config.items()
+                        if is_init_field(CompilationConfig, k)
+                    })
             else:
                 compilation_config_instance = compilation_config
         else:
             compilation_config_instance = CompilationConfig()
 
+        if structured_outputs_config is not None:
+            if isinstance(structured_outputs_config, dict):
+                structured_outputs_instance = StructuredOutputsConfig(
+                    **{
+                        k: v
+                        for k, v in structured_outputs_config.items()
+                        if is_init_field(StructuredOutputsConfig, k)
+                    })
+            else:
+                structured_outputs_instance = structured_outputs_config
+        else:
+            structured_outputs_instance = StructuredOutputsConfig()
+
         engine_args = EngineArgs(
             model=model,
             runner=runner,
@@ -271,6 +289,7 @@ class LLM:
             hf_overrides=hf_overrides,
             mm_processor_kwargs=mm_processor_kwargs,
             override_pooler_config=override_pooler_config,
+            structured_outputs_config=structured_outputs_instance,
             compilation_config=compilation_config_instance,
             logits_processors=logits_processors,
             **kwargs,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index c07e95e9370a0..93ea846f26f6c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1678,7 +1678,7 @@ async def init_app_state(
         enable_auto_tools=args.enable_auto_tool_choice,
         tool_parser=args.tool_call_parser,
         tool_server=tool_server,
-        reasoning_parser=args.reasoning_parser,
+        reasoning_parser=args.structured_outputs_config.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
         enable_log_outputs=args.enable_log_outputs,
@@ -1697,7 +1697,7 @@ async def init_app_state(
         exclude_tools_when_tool_choice_none=args.
         exclude_tools_when_tool_choice_none,
         tool_parser=args.tool_call_parser,
-        reasoning_parser=args.reasoning_parser,
+        reasoning_parser=args.structured_outputs_config.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
         enable_log_outputs=args.enable_log_outputs,
@@ -1800,10 +1800,10 @@ def validate_api_server_args(args):
                        f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
     valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys()
-    if args.reasoning_parser \
-        and args.reasoning_parser not in valid_reasoning_parses:
+    if ((reasoning_parser := args.structured_outputs_config.reasoning_parser)
+            and reasoning_parser not in valid_reasoning_parses):
         raise KeyError(
-            f"invalid reasoning parser: {args.reasoning_parser} "
+            f"invalid reasoning parser: {reasoning_parser} "
             f"(chose from {{ {','.join(valid_reasoning_parses)} }})")
 
 
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 2505e493625d8..cff4a45fdc43e 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -54,8 +54,8 @@ from vllm.entrypoints.score_utils import (ScoreContentPartParam,
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams,
-                                  RequestOutputKind, SamplingParams)
+from vllm.sampling_params import (BeamSearchParams, RequestOutputKind,
+                                  SamplingParams, StructuredOutputsParams)
 from vllm.utils import random_uuid, resolve_obj_by_qualname
 
 logger = init_logger(__name__)
@@ -373,11 +373,12 @@ class ResponsesRequest(OpenAIBaseModel):
         stop_token_ids = default_sampling_params.get("stop_token_ids")
 
         # Structured output
-        guided_decoding = None
+        structured_outputs = None
         if self.text is not None and self.text.format is not None:
             response_format = self.text.format
-            if response_format.type == "json_schema":
-                guided_decoding = GuidedDecodingParams.from_optional(
+            if (response_format.type == "json_schema"
+                    and response_format.schema_ is not None):
+                structured_outputs = StructuredOutputsParams(
                     json=response_format.schema_)
             elif response_format.type == "json_object":
                 raise NotImplementedError("json_object is not supported")
@@ -392,7 +393,7 @@ class ResponsesRequest(OpenAIBaseModel):
             stop_token_ids=stop_token_ids,
             output_kind=(RequestOutputKind.DELTA
                          if self.stream else RequestOutputKind.FINAL_ONLY),
-            guided_decoding=guided_decoding,
+            structured_outputs=structured_outputs,
         )
 
     def is_include_output_logprobs(self) -> bool:
@@ -547,42 +548,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
-    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+    structured_outputs: Optional[StructuredOutputsParams] = Field(
         default=None,
-        description=("If specified, the output will follow the JSON schema."),
-    )
-    guided_regex: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, the output will follow the regex pattern."),
-    )
-    guided_choice: Optional[list[str]] = Field(
-        default=None,
-        description=(
-            "If specified, the output will be exactly one of the choices."),
-    )
-    guided_grammar: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, the output will follow the context free grammar."),
-    )
-    structural_tag: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, the output will follow the structural tag schema."),
-    )
-    guided_decoding_backend: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, will override the default guided decoding backend "
-            "of the server for this specific request. If set, must be either "
-            "'outlines' / 'lm-format-enforcer'"),
-    )
-    guided_whitespace_pattern: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, will override the default whitespace pattern "
-            "for guided json decoding."),
+        description="Additional kwargs for structured outputs",
     )
     priority: int = Field(
         default=0,
@@ -701,31 +669,33 @@ class ChatCompletionRequest(OpenAIBaseModel):
         if prompt_logprobs is None and self.echo:
             prompt_logprobs = self.top_logprobs
 
-        guided_json_object = None
-        if self.response_format is not None:
-            if self.response_format.type == "json_object":
-                guided_json_object = True
-            elif self.response_format.type == "json_schema":
-                json_schema = self.response_format.json_schema
-                assert json_schema is not None
-                self.guided_json = json_schema.json_schema
-            elif self.response_format.type == "structural_tag":
-                structural_tag = self.response_format
-                assert structural_tag is not None and isinstance(
-                    structural_tag, StructuralTagResponseFormat)
-                s_tag_obj = structural_tag.model_dump(by_alias=True)
-                self.structural_tag = json.dumps(s_tag_obj)
+        response_format = self.response_format
+        json_schema_from_tool = self._get_json_schema_from_tool()
+        if response_format is not None or json_schema_from_tool is not None:
+            # If structured outputs wasn't already enabled,
+            # we must enable it for these features to work
+            if self.structured_outputs is None:
+                self.structured_outputs = StructuredOutputsParams()
 
-        guided_decoding = GuidedDecodingParams.from_optional(
-            json=self._get_guided_json_from_tool() or self.guided_json,
-            regex=self.guided_regex,
-            choice=self.guided_choice,
-            grammar=self.guided_grammar,
-            json_object=guided_json_object,
-            backend=self.guided_decoding_backend,
-            whitespace_pattern=self.guided_whitespace_pattern,
-            structural_tag=self.structural_tag,
-        )
+            # Set structured output params for response format
+            if response_format is not None:
+                if response_format.type == "json_object":
+                    self.structured_outputs.json_object = True
+                elif response_format.type == "json_schema":
+                    json_schema = response_format.json_schema
+                    assert json_schema is not None
+                    self.structured_outputs.json = json_schema.json_schema
+                elif response_format.type == "structural_tag":
+                    structural_tag = response_format
+                    assert structural_tag is not None and isinstance(
+                        structural_tag, StructuralTagResponseFormat)
+                    s_tag_obj = structural_tag.model_dump(by_alias=True)
+                    self.structured_outputs.structural_tag = json.dumps(
+                        s_tag_obj)
+
+            # Set structured output params for tool calling
+            if json_schema_from_tool is not None:
+                self.structured_outputs.json = json_schema_from_tool
 
         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
         if self.kv_transfer_params:
@@ -757,15 +727,14 @@ class ChatCompletionRequest(OpenAIBaseModel):
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA if self.stream \
                 else RequestOutputKind.FINAL_ONLY,
-            guided_decoding=guided_decoding,
+            structured_outputs=self.structured_outputs,
             logit_bias=self.logit_bias,
-            bad_words= self.bad_words,
+            bad_words=self.bad_words,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
         )
 
-    def _get_guided_json_from_tool(
-            self) -> Optional[Union[str, dict, BaseModel]]:
+    def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
         # user has chosen to not use any tool
         if self.tool_choice == "none" or self.tools is None:
             return None
@@ -875,28 +844,40 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
     @model_validator(mode="before")
     @classmethod
-    def check_guided_decoding_count(cls, data):
+    def check_structured_outputs_count(cls, data):
+        """Reject payloads that mix several structured output constraints.
+
+        At most one of `json`, `regex` and `choice` may be set inside
+        `structured_outputs`; otherwise a `ValueError` is raised.
+        """
         if isinstance(data, ValueError):
             raise data
 
-        guide_count = sum([
-            "guided_json" in data and data["guided_json"] is not None,
-            "guided_regex" in data and data["guided_regex"] is not None,
-            "guided_choice" in data and data["guided_choice"] is not None
-        ])
-        # you can only use one kind of guided decoding
-        if guide_count > 1:
+        # An absent, null, or non-dict value (e.g. an already constructed
+        # params object) has no raw kwargs to count, so skip the check.
+        structured_outputs_kwargs = data.get("structured_outputs")
+        if not isinstance(structured_outputs_kwargs, dict):
+            return data
+
+        count = sum(
+            structured_outputs_kwargs.get(k) is not None
+            for k in ("json", "regex", "choice"))
+        # you can only use one kind of constraint for structured outputs
+        if count > 1:
             raise ValueError(
-                "You can only use one kind of guided decoding "
-                "('guided_json', 'guided_regex' or 'guided_choice').")
-        # you can only either use guided decoding or tools, not both
-        if guide_count > 1 and data.get("tool_choice", "none") not in (
+                "You can only use one kind of constraints for structured "
+                "outputs ('json', 'regex' or 'choice').")
+        # you can only either use structured outputs or tools, not both
+        # NOTE(review): `count > 1` has already raised above, so this branch
+        # is unreachable as written.
+        if count > 1 and data.get("tool_choice", "none") not in (
                 "none",
                 "auto",
                 "required",
         ):
             raise ValueError(
-                "You can only either use guided decoding or tools, not both.")
+                "You can only either use constraints for structured outputs "
+                "or tools, not both.")
         return data
 
     @model_validator(mode="before")
@@ -1049,37 +1021,9 @@ class CompletionRequest(OpenAIBaseModel):
             ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
         ),
     )
-    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+    structured_outputs: Optional[StructuredOutputsParams] = Field(
         default=None,
-        description="If specified, the output will follow the JSON schema.",
-    )
-    guided_regex: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, the output will follow the regex pattern."),
-    )
-    guided_choice: Optional[list[str]] = Field(
-        default=None,
-        description=(
-            "If specified, the output will be exactly one of the choices."),
-    )
-    guided_grammar: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, the output will follow the context free grammar."),
-    )
-    guided_decoding_backend: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, will override the default guided decoding backend "
-            "of the server for this specific request. If set, must be one of "
-            "'outlines' / 'lm-format-enforcer'"),
-    )
-    guided_whitespace_pattern: Optional[str] = Field(
-        default=None,
-        description=(
-            "If specified, will override the default whitespace pattern "
-            "for guided json decoding."),
+        description="Additional kwargs for structured outputs",
     )
     priority: int = Field(
         default=0,
@@ -1210,20 +1154,13 @@ class CompletionRequest(OpenAIBaseModel):
 
         echo_without_generation = self.echo and self.max_tokens == 0
 
-        guided_json_object = None
+        # `response_format` alone must be able to request JSON output, so
+        # enable structured outputs here when the request did not set them.
         if (self.response_format is not None
                 and self.response_format.type == "json_object"):
-            guided_json_object = True
-
-        guided_decoding = GuidedDecodingParams.from_optional(
-            json=self.guided_json,
-            regex=self.guided_regex,
-            choice=self.guided_choice,
-            grammar=self.guided_grammar,
-            json_object=guided_json_object,
-            backend=self.guided_decoding_backend,
-            whitespace_pattern=self.guided_whitespace_pattern,
-        )
+            if self.structured_outputs is None:
+                self.structured_outputs = StructuredOutputsParams()
+            self.structured_outputs.json_object = True
 
         extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
         if self.kv_transfer_params:
@@ -1255,7 +1189,7 @@ class CompletionRequest(OpenAIBaseModel):
             truncate_prompt_tokens=self.truncate_prompt_tokens,
             output_kind=RequestOutputKind.DELTA if self.stream \
                 else RequestOutputKind.FINAL_ONLY,
-            guided_decoding=guided_decoding,
+            structured_outputs=self.structured_outputs,
             logit_bias=self.logit_bias,
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
@@ -1263,16 +1197,18 @@ class CompletionRequest(OpenAIBaseModel):
 
     @model_validator(mode="before")
     @classmethod
-    def check_guided_decoding_count(cls, data):
-        guide_count = sum([
-            "guided_json" in data and data["guided_json"] is not None,
-            "guided_regex" in data and data["guided_regex"] is not None,
-            "guided_choice" in data and data["guided_choice"] is not None
-        ])
-        if guide_count > 1:
+    def check_structured_outputs_count(cls, data):
+        if data.get("structured_outputs") is None:
+            return data
+
+        structured_outputs_kwargs = data['structured_outputs']
+        count = sum(
+            structured_outputs_kwargs.get(k) is not None
+            for k in ("json", "regex", "choice"))
+        if count > 1:
             raise ValueError(
-                "You can only use one kind of guided decoding "
-                "('guided_json', 'guided_regex' or 'guided_choice').")
+                "You can only use one kind of constraints for structured "
+                "outputs ('json', 'regex' or 'choice').")
         return data
 
     @model_validator(mode="before")
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index cd85baa9ba661..16564214e353a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -993,7 +993,7 @@ class OpenAIServingChat(OpenAIServing):
                         # check to make sure we haven't "forgotten" to stream
                         #   any tokens that were generated but previously
                         #   matched by partial json parsing
-                        # only happens if we are NOT using guided decoding
+                        # only happens if we are NOT using structured outputs
                         auto_tools_called = False
                         if tool_parser:
                             auto_tools_called = len(
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 687af7a189cea..ce3d23763ed64 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -262,9 +262,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig):
 
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
-        decoding_config = vllm_config.decoding_config
-        if decoding_config.reasoning_backend == "":
-            decoding_config.reasoning_backend = "openai_gptoss"
+        structured_outputs_config = vllm_config.structured_outputs_config
+        if structured_outputs_config.reasoning_parser == "":
+            structured_outputs_config.reasoning_parser = "openai_gptoss"
 
         # Increase the max capture size from 512 to 1024 for performance.
         # NOTE(woosuk): This will increase the number of CUDA graphs
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index fe93e906064e4..0a01cb0260ae5 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -2,13 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampling parameters for text generation."""
 import copy
-from dataclasses import dataclass
+from dataclasses import field
 from enum import Enum, IntEnum
 from functools import cached_property
 from typing import Annotated, Any, Optional, Union
 
 import msgspec
-from pydantic import BaseModel
+from pydantic.dataclasses import dataclass
 
 from vllm.logger import init_logger
 from vllm.logits_process import LogitsProcessor
@@ -28,60 +28,35 @@ class SamplingType(IntEnum):
 
 # maybe make msgspec?
 @dataclass
-class GuidedDecodingParams:
-    """One of these fields will be used to build a logit processor."""
+class StructuredOutputsParams:
+    # One of these fields will be used to build a logit processor.
     json: Optional[Union[str, dict]] = None
     regex: Optional[str] = None
     choice: Optional[list[str]] = None
     grammar: Optional[str] = None
     json_object: Optional[bool] = None
-    """These are other options that can be set"""
-    backend: Optional[str] = None
-    backend_was_auto: bool = False
+    # These are other options that can be set.
     disable_fallback: bool = False
     disable_any_whitespace: bool = False
     disable_additional_properties: bool = False
     whitespace_pattern: Optional[str] = None
     structural_tag: Optional[str] = None
 
-    @staticmethod
-    def from_optional(
-        json: Optional[Union[dict, BaseModel, str]] = None,
-        regex: Optional[str] = None,
-        choice: Optional[list[str]] = None,
-        grammar: Optional[str] = None,
-        json_object: Optional[bool] = None,
-        backend: Optional[str] = None,
-        whitespace_pattern: Optional[str] = None,
-        structural_tag: Optional[str] = None,
-    ) -> Optional["GuidedDecodingParams"]:
-        if all(arg is None for arg in (json, regex, choice, grammar,
-                                       json_object, structural_tag)):
-            return None
-        # Extract json schemas from pydantic models
-        if isinstance(json, (BaseModel, type(BaseModel))):
-            json = json.model_json_schema()
-        return GuidedDecodingParams(
-            json=json,
-            regex=regex,
-            choice=choice,
-            grammar=grammar,
-            json_object=json_object,
-            backend=backend,
-            whitespace_pattern=whitespace_pattern,
-            structural_tag=structural_tag,
-        )
+    _backend: Optional[str] = field(default=None, init=False)
+    """CAUTION: Should only be set by Processor._validate_structured_output"""
+    _backend_was_auto: bool = field(default=False, init=False)
+    """CAUTION: Should only be set by Processor._validate_structured_output"""
 
     def __post_init__(self):
         """Validate that some fields are mutually exclusive."""
-        guide_count = sum([
+        count = sum([
             self.json is not None, self.regex is not None, self.choice
             is not None, self.grammar is not None, self.json_object is not None
         ])
-        if guide_count > 1:
+        if count > 1:
             raise ValueError(
-                "You can only use one kind of guided decoding but multiple are "
-                f"specified: {self.__dict__}")
+                "You can only use one kind of structured outputs constraint "
+                f"but multiple are specified: {self.__dict__}")
 
 
 class RequestOutputKind(Enum):
@@ -196,9 +171,8 @@ class SamplingParams(
     _all_stop_token_ids: set[int] = msgspec.field(default_factory=set)
 
     # Fields used to construct logits processors
-    guided_decoding: Optional[GuidedDecodingParams] = None
-    """If provided, the engine will construct a guided decoding logits
-    processor from these parameters."""
+    structured_outputs: Optional[StructuredOutputsParams] = None
+    """Parameters for configuring structured outputs."""
     logit_bias: Optional[dict[int, float]] = None
     """If provided, the engine will construct a logits processor that applies
     these logit biases."""
@@ -246,7 +220,7 @@ class SamplingParams(
                                                    msgspec.Meta(
                                                        ge=-1)]] = None,
         output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
-        guided_decoding: Optional[GuidedDecodingParams] = None,
+        structured_outputs: Optional[StructuredOutputsParams] = None,
         logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
         allowed_token_ids: Optional[list[int]] = None,
         extra_args: Optional[dict[str, Any]] = None,
@@ -288,7 +262,7 @@ class SamplingParams(
             logits_processors=logits_processors,
             truncate_prompt_tokens=truncate_prompt_tokens,
             output_kind=output_kind,
-            guided_decoding=guided_decoding,
+            structured_outputs=structured_outputs,
             logit_bias=logit_bias,
             allowed_token_ids=allowed_token_ids,
             extra_args=extra_args,
@@ -559,7 +533,7 @@ class SamplingParams(
             "spaces_between_special_tokens="
             f"{self.spaces_between_special_tokens}, "
             f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
-            f"guided_decoding={self.guided_decoding}, "
+            f"structured_outputs={self.structured_outputs}, "
             f"extra_args={self.extra_args})")
 
 
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 5b07327cf2b81..d8a8d19391cd0 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -274,7 +274,7 @@ class MistralTokenizer(TokenizerBase):
         return tokenizer_file
 
     # the following attributes are set to fit vLLM's design and are used
-    # by the guided structured output backends.
+    # by the structured output backends.
     @property
     def all_special_tokens_extended(self) -> list[str]:
         from mistral_common.tokens.tokenizers.base import SpecialTokens
@@ -463,9 +463,6 @@ class MistralTokenizer(TokenizerBase):
 
         return decoded
 
-    # WARN: Outlines logits processors can overwrite this method.
-    # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer
-    # for more.
     def decode(self,
                ids: Union[list[int], int],
                skip_special_tokens: bool = True) -> str:
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index f17c269e4709e..73165c7e4c0ad 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -588,9 +588,6 @@ class AsyncLLM(EngineClient):
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
 
-    async def get_decoding_config(self):
-        raise ValueError("Not Supported on V1 yet.")
-
     async def get_input_preprocessor(self) -> InputPreprocessor:
         return self.processor.input_preprocessor
 
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 8d9f2ba1ec825..71f539583a1be 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -45,7 +45,7 @@ class Processor:
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.lora_config = vllm_config.lora_config
-        self.decoding_config = vllm_config.decoding_config
+        self.structured_outputs_config = vllm_config.structured_outputs_config
         self.tokenizer = tokenizer
 
         self.generation_config_fields = (
@@ -219,58 +219,57 @@ class Processor:
                 "[lora_path]` to use the LoRA tokenizer.")
 
     def _validate_structured_output(self, params: SamplingParams) -> None:
-        if not params.guided_decoding or not self.decoding_config:
+        if not params.structured_outputs or not self.structured_outputs_config:
             return
 
-        if self.model_config.skip_tokenizer_init and params.guided_decoding:
+        if self.model_config.skip_tokenizer_init and params.structured_outputs:
             raise ValueError(
                 "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
             )
 
-        engine_level_backend = self.decoding_config.backend
-        if params.guided_decoding.backend:
-            # Request-level backend selection is not supported in V1.
+        backend = self.structured_outputs_config.backend
+        if _backend := params.structured_outputs._backend:
+            # Request-level backend selection is not supported.
             # The values may differ if `params` is reused and was set
             # to a specific backend based on `auto` behavior in a previous
             # request. We remember that it was set as a result of `auto`
-            # using the `_auto` option set on the backend in the params.
-            if (params.guided_decoding.backend != engine_level_backend
-                    and not (engine_level_backend == "auto"
-                             and params.guided_decoding.backend_was_auto)):
+            # using the `_backend_was_auto` field set in the params.
+            if (backend != _backend
+                    and not (backend == "auto"
+                             and params.structured_outputs._backend_was_auto)):
                 raise ValueError(
-                    "Request-level structured output backend selection is no "
-                    "longer supported. The request specified "
-                    f"'{params.guided_decoding.backend}', but vLLM was "
-                    f"initialised with '{engine_level_backend}'. This error "
-                    "can be resolved by removing backend selection from the "
-                    "request.")
+                    "Request-level structured output backend selection is not "
+                    f"supported. The request specified '{_backend}', but vLLM "
+                    f"was initialised with '{backend}'. This error can be "
+                    "resolved by removing '_backend' from the request.")
         else:
-            params.guided_decoding.backend = engine_level_backend
+            params.structured_outputs._backend = backend
 
         # Request content validation
-        if (isinstance(params.guided_decoding.choice, list)
-                and not params.guided_decoding.choice):
+        if (isinstance(params.structured_outputs.choice, list)
+                and not params.structured_outputs.choice):
             # It is invalid for choice to be an empty list
-            raise ValueError(f"Choice '{params.guided_decoding.choice}' "
-                             "cannot be an empty list")
+            raise ValueError(
+                f"Choice '{params.structured_outputs.choice}' cannot be an empty list"  # noqa: E501
+            )
 
-        if engine_level_backend.startswith("xgrammar"):
+        if backend.startswith("xgrammar"):
             # xgrammar with no fallback
             validate_xgrammar_grammar(params)
-        elif engine_level_backend.startswith("guidance"):
+        elif backend.startswith("guidance"):
             # TODO: ideally we would have the LLTokenizer here as Lark syntax
             # allows <|special_token|> and similar, see
             # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens
             # Without tokenizer these are disallowed in grammars.
             validate_guidance_grammar(params, tokenizer=None)
-        elif engine_level_backend == "outlines":
+        elif backend == "outlines":
             # outlines backend
             validate_structured_output_request_outlines(params)
-        elif engine_level_backend == "lm-format-enforcer":
+        elif backend == "lm-format-enforcer":
             # lm format enforcer backend
             validate_structured_output_request_lm_format_enforcer(params)
         else:
-            # NOTE: engine_level_backend must be "auto" here, because we have
+            # NOTE: backend must be "auto" here, because we have
             # checked supported_backends above.
             # In this mode, we set opinionated defaults based on what we think
             # will satisfy the most use cases without having to worry about
@@ -278,15 +277,15 @@ class Processor:
             # other setting where a specific backend was specified.
             try:
                 validate_xgrammar_grammar(params)
-                params.guided_decoding.backend = "xgrammar"
+                params.structured_outputs._backend = "xgrammar"
             except ValueError:
                 # The request either failed validation
                 # or includes some jsonschema feature(s) that
                 # are not supported in xgrammar. Fall back to guidance.
                 validate_guidance_grammar(params, tokenizer=None)
-                params.guided_decoding.backend = "guidance"
+                params.structured_outputs._backend = "guidance"
             # Remember that this backend was set automatically
-            params.guided_decoding.backend_was_auto = True
+            params.structured_outputs._backend_was_auto = True
 
     def _maybe_build_mm_uuids(
         self,
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 4e3e581235cce..145af788d2372 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -67,7 +67,7 @@ class Request:
             # Generative models.
             assert sampling_params.max_tokens is not None
             self.max_tokens = sampling_params.max_tokens
-            if sampling_params.guided_decoding is not None:
+            if sampling_params.structured_outputs is not None:
                 self.status = RequestStatus.WAITING_FOR_FSM
                 self.use_structured_output = True
 
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 1ab29dfecd9e4..13c33d3edf141 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -61,11 +61,11 @@ class StructuredOutputManager:
             self.executor = ThreadPoolExecutor(max_workers=max_workers)
             self.tokenizer = init_tokenizer_from_configs(
                 model_config=self.vllm_config.model_config)
-            reasoning_backend = \
-                    self.vllm_config.decoding_config.reasoning_backend
-            if reasoning_backend:
+            reasoning_parser = \
+                    self.vllm_config.structured_outputs_config.reasoning_parser
+            if reasoning_parser:
                 reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_backend)
+                    reasoning_parser)
                 self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
     def grammar_init(self, request: Request) -> None:
@@ -74,15 +74,16 @@ class StructuredOutputManager:
 
         if TYPE_CHECKING:
             assert request.sampling_params is not None and \
-                request.sampling_params.guided_decoding is not None
+                request.sampling_params.structured_outputs is not None
 
         # Initialize the backend the first time it is needed.
         #
         # NOTE: We only support a single backend. We do NOT support different
         # backends on a per-request basis in V1 (for now, anyway...).
+        # _backend is set in Processor._validate_structured_output
         if self.backend is None:
             assert request.sampling_params is not None
-            backend = request.sampling_params.guided_decoding.backend
+            backend = request.sampling_params.structured_outputs._backend
             vocab_size = self.vllm_config.model_config.get_vocab_size()
             if backend == "xgrammar":
                 self.backend = XgrammarBackend(
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 02e7fc33f517d..e06ab6377de3a 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend):
 
     def __post_init__(self):
         self.disable_any_whitespace = \
-            self.vllm_config.decoding_config.disable_any_whitespace
+            self.vllm_config.structured_outputs_config.disable_any_whitespace
         self.disable_additional_properties = \
-            self.vllm_config.decoding_config.disable_additional_properties
+            self.vllm_config.structured_outputs_config.disable_additional_properties
 
         self.ll_tokenizer = llguidance_hf.from_tokenizer(
             self.tokenizer, self.vocab_size)
diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py
index 2279a1c8c8a00..465b2428f8938 100644
--- a/vllm/v1/structured_output/backend_lm_format_enforcer.py
+++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py
@@ -138,30 +138,30 @@ class LMFormatEnforcerBackend(StructuredOutputBackend):
 
 def validate_structured_output_request_lm_format_enforcer(
         params: SamplingParams):
-    if params.guided_decoding is None:
+    if params.structured_outputs is None:
         return
 
-    gd_params = params.guided_decoding
+    so_params = params.structured_outputs
 
-    if gd_params.regex:
+    if so_params.regex:
         return
-    elif gd_params.json:
-        if isinstance(gd_params.json, str):
+    elif so_params.json:
+        if isinstance(so_params.json, str):
             try:
                 # make sure schema is valid json
-                json.loads(gd_params.json)
+                json.loads(so_params.json)
             except json.JSONDecodeError as e:
                 raise ValueError("Invalid JSON grammar specification.") from e
         else:
             try:
-                json.dumps(gd_params.json)
+                json.dumps(so_params.json)
             except Exception as e:
                 raise ValueError(
-                    f"Error serializing guided decoding jsonschema: {e}"
+                    f"Error serializing structured outputs jsonschema: {e}"
                 ) from e
         return
-    elif gd_params.choice:
+    elif so_params.choice:
         return
-    elif gd_params.grammar:
-        raise ValueError("LM Format Enforcer guided decoding backend "
+    elif so_params.grammar:
+        raise ValueError("LM Format Enforcer structured outputs backend "
                          "does not support grammar specifications")
diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py
index 572e4984480fa..e5e638a6ad764 100644
--- a/vllm/v1/structured_output/backend_outlines.py
+++ b/vllm/v1/structured_output/backend_outlines.py
@@ -158,36 +158,36 @@ class OutlinesGrammar(StructuredOutputGrammar):
 
 
 def validate_structured_output_request_outlines(params: SamplingParams):
-    if params.guided_decoding is None:
+    if params.structured_outputs is None:
         return
 
-    gd_params = params.guided_decoding
+    so_params = params.structured_outputs
 
-    if gd_params.regex:
-        validate_regex_is_buildable(gd_params.regex)
-    elif gd_params.json:
-        if isinstance(gd_params.json, str):
+    if so_params.regex:
+        validate_regex_is_buildable(so_params.regex)
+    elif so_params.json:
+        if isinstance(so_params.json, str):
             try:
                 # make sure schema is valid json
-                json.loads(gd_params.json)
-                schema = gd_params.json
+                json.loads(so_params.json)
+                schema = so_params.json
             except json.JSONDecodeError as e:
                 raise ValueError("Invalid JSON grammar specification.") from e
         else:
             try:
-                schema = json.dumps(gd_params.json)
+                schema = json.dumps(so_params.json)
             except Exception as e:
                 raise ValueError(
-                    f"Error serializing guided decoding jsonschema: {e}"
+                    f"Error serializing structured outputs jsonschema: {e}"
                 ) from e
         pattern = json_schema.build_regex_from_schema(schema)
         validate_regex_is_buildable(pattern)
-    elif gd_params.choice:
-        choices = [regex_escape(str(choice)) for choice in gd_params.choice]
+    elif so_params.choice:
+        choices = [regex_escape(str(choice)) for choice in so_params.choice]
         regex = "(" + "|".join(choices) + ")"
         validate_regex_is_buildable(regex)
-    elif gd_params.grammar:
-        raise ValueError("Outlines guided decoding backend "
+    elif so_params.grammar:
+        raise ValueError("Outlines structured outputs backend "
                          "does not support grammar specifications")
 
 
@@ -306,7 +306,7 @@ def validate_regex_is_buildable(pattern: str) -> None:
         _check_unsupported(parsed)
     except ValueError as e:
         raise ValueError(
-            f"Regex uses unsupported feature for guided decoding: {e}. "
+            f"Regex uses unsupported feature for structured outputs: {e}. "
             "Only basic matching constructs are supported—lookarounds, "
             "backreferences, and unicode boundaries are not.") from e
 
@@ -315,6 +315,6 @@ def validate_regex_is_buildable(pattern: str) -> None:
             "Regex does not have a anchored universal start state"
             "This means that the Regex uses anchors (^) or look-arounds "
             "in a way which requires context before any token is matched."
-            "Guided decoding needs regexes that can match without needing "
+            "structured outputs needs regexes that can match without needing "
             "that context. Try rewriting the pattern without using these "
             f"constructs. Pattern:\n{pattern}")
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index 5e00f63804162..55b4792fe010d 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend):
 
     def __post_init__(self):
         self.disable_any_whitespace = \
-            self.vllm_config.decoding_config.disable_any_whitespace
+            self.vllm_config.structured_outputs_config.disable_any_whitespace
 
         if isinstance(self.tokenizer, MistralTokenizer):
             # NOTE: ideally, xgrammar should handle this accordingly.
@@ -248,37 +248,37 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
 
     Raises ValueError if the request is not supported.
     """
-    if sampling_params.guided_decoding is None:
+    if sampling_params.structured_outputs is None:
         return
 
-    gd_params = sampling_params.guided_decoding
+    so_params = sampling_params.structured_outputs
 
-    if gd_params.regex:
+    if so_params.regex:
         try:
-            xgr.Grammar.from_regex(gd_params.regex)
+            xgr.Grammar.from_regex(so_params.regex)
         except Exception as err:
             raise ValueError("Failed to transform regex into a grammar: "
                              f"{err}") from err
 
-    if gd_params.choice:
-        choice_grammar = choice_as_grammar(gd_params.choice)
+    if so_params.choice:
+        choice_grammar = choice_as_grammar(so_params.choice)
         try:
             xgr.Grammar.from_ebnf(choice_grammar)
         except Exception as err:
             raise ValueError("Failed to transform choices into a grammar: "
                              "{err}") from err
-        gd_params.choice = None
-        gd_params.grammar = choice_grammar
+        so_params.choice = None
+        so_params.grammar = choice_grammar
         return
 
-    if gd_params.json:
-        if isinstance(gd_params.json, str):
+    if so_params.json:
+        if isinstance(so_params.json, str):
             try:
-                schema = json.loads(gd_params.json)
+                schema = json.loads(so_params.json)
             except json.JSONDecodeError as e:
                 raise ValueError("Invalid JSON grammar specification.") from e
         else:
-            schema = gd_params.json
+            schema = so_params.json
 
         try:
             xgr.Grammar.from_json_schema(schema)
@@ -291,11 +291,11 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
                              "supported by xgrammar.")
         return
 
-    if gd_params.grammar:
-        if grammar_is_likely_lark(gd_params.grammar):
+    if so_params.grammar:
+        if grammar_is_likely_lark(so_params.grammar):
             # xgrammar supports EBNF grammars only
             try:
-                gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar)
+                so_params.grammar = convert_lark_to_ebnf(so_params.grammar)
             except ValueError as e:
                 raise ValueError(
                     "Failed to convert the grammar from Lark to EBNF. ") from e
@@ -303,14 +303,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None:
         # Test parsing EBNF grammar, possibly already converted from Lark
         try:
             # parse the grammar, but we aren't compiling it.
-            xgr.Grammar.from_ebnf(gd_params.grammar)
+            xgr.Grammar.from_ebnf(so_params.grammar)
         except Exception as e:
             raise ValueError("Invalid grammar specification.") from e
         return
 
-    if gd_params.structural_tag:
+    if so_params.structural_tag:
         try:
-            s_tag = json.loads(gd_params.structural_tag)
+            s_tag = json.loads(so_params.structural_tag)
             tags = [
                 xgr.StructuralTagItem(
                     begin=s["begin"],
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index fc365f12573fc..99974ef46ecd5 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -60,7 +60,7 @@ class StructuredOutputRequest:
 
 def get_structured_output_key(
         sampling_params: SamplingParams) -> StructuredOutputKey:
-    params = sampling_params.guided_decoding
+    params = sampling_params.structured_outputs
     assert params is not None, "params can't be None."
     if params.json is not None:
         if not isinstance(params.json, str):

From 4f02b77de4e794a0d417ed98a26884208f75e043 Mon Sep 17 00:00:00 2001
From: ihb2032 <40718643+ihb2032@users.noreply.github.com>
Date: Thu, 18 Sep 2025 17:43:23 +0800
Subject: [PATCH 22/58] Fix: Add explicit #include <omp.h> for OpenMP
 compatibility on certain toolchains  (#24951)

Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
---
 csrc/cpu/cpu_types.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index 17bbe04eef94a..c3a21796881c9 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -17,4 +17,8 @@
   #warning "unsupported vLLM cpu implementation"
 #endif
 
+#ifdef _OPENMP
+  #include <omp.h>
+#endif
+
 #endif
\ No newline at end of file

From abdfcd4f3dc21dc162baf6887f658fb0f2f3d783 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= <elvircrn@gmail.com>
Date: Thu, 18 Sep 2025 12:25:12 +0200
Subject: [PATCH 23/58] silu-v1: Fix EPS not being used during max-reduction
 (#25069)

Signed-off-by: elvircrn <elvircrn@gmail.com>
---
 csrc/quantization/activation_kernels.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu
index 9ddb5af3052fa..9aa1411b4a25c 100644
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
   int32_t compute_pipeline_offset_64 = 0;
 
   for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
-    __nv_bfloat16 y_max_bf16 = EPS;
     __nv_bfloat162 results_bf162[2];
 
     cp_async_wait<NUM_STAGES - 2>();
@@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel(
     auto _y_max2 =
         __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
 
-    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
+    __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y));
 
     // An entire group is assigned to a single warp, so a simple warp reduce
     // is used.

From cc935fdd7e0c466cd556b6515e435dddd78677e0 Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Thu, 18 Sep 2025 18:34:42 +0800
Subject: [PATCH 24/58] [Frontend] Support setting logprobs to -1 (#25031)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 tests/entrypoints/openai/test_chat_echo.py | 23 ++++++++++++++++++++++
 vllm/entrypoints/openai/protocol.py        |  8 +++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py
index 0f459dd3d8574..ce965eb829248 100644
--- a/tests/entrypoints/openai/test_chat_echo.py
+++ b/tests/entrypoints/openai/test_chat_echo.py
@@ -99,3 +99,26 @@ async def test_prompt_logprobs(client: openai.AsyncOpenAI):
 
     assert completion.prompt_logprobs is not None
     assert len(completion.prompt_logprobs) > 0
+
+
+@pytest.mark.asyncio
+async def test_top_logprobs(client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "Beijing is the capital of which country?"
+    }]
+
+    completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        extra_body={
+            "top_logprobs": -1,
+            "logprobs": "true",
+        },
+    )
+    assert completion.choices[0].logprobs is not None
+    assert completion.choices[0].logprobs.content is not None
+    assert len(completion.choices[0].logprobs.content) > 0
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index cff4a45fdc43e..7ad8e73d89d59 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -832,10 +832,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
                 raise ValueError("`prompt_logprobs=-1` is only supported with "
                                  "vLLM engine V1.")
         if (top_logprobs := data.get("top_logprobs")) is not None:
-            if top_logprobs < 0:
-                raise ValueError("`top_logprobs` must be a positive value.")
+            if top_logprobs < 0 and top_logprobs != -1:
+                raise ValueError(
+                    "`top_logprobs` must be a positive value or -1.")
 
-            if top_logprobs > 0 and not data.get("logprobs"):
+            if (top_logprobs == -1
+                    or top_logprobs > 0) and not data.get("logprobs"):
                 raise ValueError(
                     "when using `top_logprobs`, `logprobs` must be set to true."
                 )

From 37970105fed95d58677f0a4635cb253a71e8817c Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 18 Sep 2025 19:04:21 +0800
Subject: [PATCH 25/58] [Model] Improve Pooling Model (#25149)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/layers/pooler.py | 12 ++++++------
 vllm/v1/worker/gpu_model_runner.py   |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index b571a8f866990..4a97438b1bb2c 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -12,8 +12,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.config import ModelConfig, PoolerConfig
+from vllm.config import ModelConfig, PoolerConfig, get_current_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.models.adapters import _load_st_projector
 from vllm.pooling_params import PoolingParams
 from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput
 from vllm.tasks import PoolingTask
@@ -377,7 +378,6 @@ class PoolerClassify(PoolerActivation):
         super().__init__()
 
         if static_num_labels:
-            from vllm.config import get_current_vllm_config
             vllm_config = get_current_vllm_config()
             self.num_labels = getattr(vllm_config.model_config.hf_config,
                                       "num_labels", 0)
@@ -427,8 +427,6 @@ class EmbeddingPoolerHead(PoolerHead):
         super().__init__(activation=PoolerNormalize())
 
         # Load ST projector if available
-        from vllm.config import get_current_vllm_config
-        from vllm.model_executor.models.adapters import _load_st_projector
 
         vllm_config = get_current_vllm_config()
         self.projector: Optional[nn.Module] = _load_st_projector(
@@ -489,7 +487,6 @@ class RewardPoolerHead(PoolerHead):
     def __init__(self) -> None:
         super().__init__(activation=PoolerClassify(static_num_labels=False))
 
-        from vllm.config import get_current_vllm_config
         vllm_config = get_current_vllm_config()
         self.head_dtype = vllm_config.model_config.head_dtype
 
@@ -638,7 +635,6 @@ class ClassifierPooler(Pooler):
     ) -> None:
         super().__init__()
 
-        from vllm.config import get_current_vllm_config
         vllm_config = get_current_vllm_config()
 
         self.pooling = pooling
@@ -730,3 +726,7 @@ class DispatchPooler(Pooler):
             offset += num_items
 
         return PoolerOutput(outputs)
+
+    def extra_repr(self) -> str:
+        s = f"supported_task={self.get_supported_tasks()}"
+        return s
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e8ad9c2fca07c..2e67984cb4327 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3151,6 +3151,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         model = cast(VllmModelForPooling, self.get_model())
         dummy_pooling_params = PoolingParams(task=task)
+        dummy_pooling_params.verify(task=task, model_config=self.model_config)
         to_update = model.pooler.get_pooling_updates(task)
         to_update.apply(dummy_pooling_params)
 

From 8ed039d52775aaee4a61663dd5d8c840f5eebd15 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:24:27 +0100
Subject: [PATCH 26/58] Move `StructuredOutputsConfig` from
 `config/__init__.py` to `config/structured_outputs.py` (#25153)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config/__init__.py           | 61 +----------------------------
 vllm/config/structured_outputs.py | 64 +++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 60 deletions(-)
 create mode 100644 vllm/config/structured_outputs.py

diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 9a1c5f0b0d453..69ab5712d404c 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -42,6 +42,7 @@ from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
 from vllm.config.speculative import SpeculativeConfig
+from vllm.config.structured_outputs import StructuredOutputsConfig
 from vllm.config.utils import ConfigType, config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -2277,66 +2278,6 @@ def get_served_model_name(model: str,
     return served_model_name
 
 
-StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines",
-                                   "lm-format-enforcer"]
-
-
-@config
-@dataclass
-class StructuredOutputsConfig:
-    """Dataclass which contains structured outputs config for the engine."""
-
-    backend: StructuredOutputsBackend = "auto"
-    """Which engine will be used for structured outputs (e.g. JSON schema,
-    regex, etc) by default. With "auto", we will make opinionated choices
-    based on request contents and what the backend libraries currently support,
-    so the behavior is subject to change in each release."""
-
-    disable_fallback: bool = False
-    """If `True`, vLLM will not fallback to a different backend on error."""
-
-    disable_any_whitespace: bool = False
-    """If `True`, the model will not generate any whitespace during structured
-    outputs. This is only supported for xgrammar and guidance backends."""
-
-    disable_additional_properties: bool = False
-    """If `True`, the `guidance` backend will not use `additionalProperties`
-    in the JSON schema. This is only supported for the `guidance` backend and
-    is used to better align its behaviour with `outlines` and `xgrammar`."""
-
-    reasoning_parser: str = ""
-    """Select the reasoning parser depending on the model that you're using.
-    This is used to parse the reasoning content into OpenAI API format."""
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self):
-        if (self.disable_any_whitespace
-                and self.backend not in ("xgrammar", "guidance")):
-            raise ValueError("disable_any_whitespace is only supported for "
-                             "xgrammar and guidance backends.")
-        if (self.disable_additional_properties and self.backend != "guidance"):
-            raise ValueError("disable_additional_properties is only supported "
-                             "for the guidance backend.")
-
-
 DetailedTraceModules = Literal["model", "worker", "all"]
 
 
diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py
new file mode 100644
index 0000000000000..b1f14294510f8
--- /dev/null
+++ b/vllm/config/structured_outputs.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+from typing import Any, Literal
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines",
+                                   "lm-format-enforcer"]
+
+
+@config
+@dataclass
+class StructuredOutputsConfig:
+    """Dataclass which contains structured outputs config for the engine."""
+
+    backend: StructuredOutputsBackend = "auto"
+    """Which engine will be used for structured outputs (e.g. JSON schema,
+    regex, etc) by default. With "auto", we will make opinionated choices
+    based on request contents and what the backend libraries currently support,
+    so the behavior is subject to change in each release."""
+    disable_fallback: bool = False
+    """If `True`, vLLM will not fall back to a different backend on error."""
+    disable_any_whitespace: bool = False
+    """If `True`, the model will not generate any whitespace during structured
+    outputs. This is only supported for xgrammar and guidance backends."""
+    disable_additional_properties: bool = False
+    """If `True`, the `guidance` backend will not use `additionalProperties`
+    in the JSON schema. This is only supported for the `guidance` backend and
+    is used to better align its behaviour with `outlines` and `xgrammar`."""
+    reasoning_parser: str = ""
+    """Select the reasoning parser depending on the model that you're using.
+    This is used to parse the reasoning content into OpenAI API format."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        if (self.disable_any_whitespace
+                and self.backend not in ("xgrammar", "guidance")):
+            raise ValueError("disable_any_whitespace is only supported for "
+                             "xgrammar and guidance backends.")
+        if (self.disable_additional_properties and self.backend != "guidance"):
+            raise ValueError("disable_additional_properties is only supported "
+                             "for the guidance backend.")

From eaffe4486cb1d7edf884e6e254cab33fc397e308 Mon Sep 17 00:00:00 2001
From: Kay Yan <kay.yan@daocloud.io>
Date: Thu, 18 Sep 2025 19:36:47 +0800
Subject: [PATCH 27/58] [Docs] Fix pooling-params doc references in
 openai_compatible_server.md (#24939)

---
 docs/api/README.md                       |  1 -
 docs/serving/openai_compatible_server.md | 20 ++++++++++++--------
 vllm/pooling_params.py                   | 20 ++++++++++++++------
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/docs/api/README.md b/docs/api/README.md
index 148211756480c..86e310f567dd3 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -46,7 +46,6 @@ Engine classes for offline and online inference.
 Inference parameters for vLLM APIs.
 
 [](){ #sampling-params }
-[](){ #pooling-params }
 
 - [vllm.SamplingParams][]
 - [vllm.PoolingParams][]
diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
index bc52d02a50bd2..bac3f6c1fe90c 100644
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -317,10 +317,11 @@ Full example: <gh-file:examples/online_serving/pooling/openai_chat_embedding_cli
 
 #### Extra parameters
 
-The following [pooling parameters][pooling-params] are supported.
+The following [pooling parameters][vllm.PoolingParams] are supported.
 
 ```python
---8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
+--8<-- "vllm/pooling_params.py:common-pooling-params"
+--8<-- "vllm/pooling_params.py:embedding-pooling-params"
 ```
 
 The following extra parameters are supported by default:
@@ -527,10 +528,11 @@ curl -v "http://127.0.0.1:8000/classify" \
 
 #### Extra parameters
 
-The following [pooling parameters][pooling-params] are supported.
+The following [pooling parameters][vllm.PoolingParams] are supported.
 
 ```python
---8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
+--8<-- "vllm/pooling_params.py:common-pooling-params"
+--8<-- "vllm/pooling_params.py:classification-pooling-params"
 ```
 
 The following extra parameters are supported:
@@ -733,10 +735,11 @@ Full example: <gh-file:examples/online_serving/openai_cross_encoder_score_for_mu
 
 #### Extra parameters
 
-The following [pooling parameters][pooling-params] are supported.
+The following [pooling parameters][vllm.PoolingParams] are supported.
 
 ```python
---8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
+--8<-- "vllm/pooling_params.py:common-pooling-params"
+--8<-- "vllm/pooling_params.py:classification-pooling-params"
 ```
 
 The following extra parameters are supported:
@@ -815,10 +818,11 @@ Result documents will be sorted by relevance, and the `index` property can be us
 
 #### Extra parameters
 
-The following [pooling parameters][pooling-params] are supported.
+The following [pooling parameters][vllm.PoolingParams] are supported.
 
 ```python
---8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
+--8<-- "vllm/pooling_params.py:common-pooling-params"
+--8<-- "vllm/pooling_params.py:classification-pooling-params"
 ```
 
 The following extra parameters are supported:
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index 6672392b8d080..a6313367457a4 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -20,25 +20,33 @@ class PoolingParams(
     """API parameters for pooling models.
 
     Attributes:
+        truncate_prompt_tokens: Controls prompt truncation.
+            Set to -1 to use the model's default truncation size.
+            Set to k to keep only the last k tokens (left truncation).
+            Set to None to disable truncation.
         normalize: Whether to normalize the embeddings outputs.
         dimensions: Reduce the dimensions of embeddings
-                    if model support matryoshka representation.
+            if the model supports matryoshka representation.
         activation: Whether to apply activation function to
-                    the classification outputs.
+            the classification outputs.
         softmax: Whether to apply softmax to the reward outputs.
     """
+
+    # --8<-- [start:common-pooling-params]
     truncate_prompt_tokens: Optional[Annotated[int,
                                                msgspec.Meta(ge=-1)]] = None
-    """If set to -1, will use the truncation size supported by the model. If
-    set to an integer k, will use only the last k tokens from the prompt
-    (i.e., left truncation). If set to `None`, truncation is disabled."""
+    # --8<-- [end:common-pooling-params]
 
     ## for embeddings models
+    # --8<-- [start:embedding-pooling-params]
     dimensions: Optional[int] = None
     normalize: Optional[bool] = None
+    # --8<-- [end:embedding-pooling-params]
 
-    ## for classification models
+    ## for classification, scoring and rerank
+    # --8<-- [start:classification-pooling-params]
     activation: Optional[bool] = None
+    # --8<-- [end:classification-pooling-params]
 
     ## for reward models
     softmax: Optional[bool] = None

From c9ff9e6f0cf48615bad1525caeef3025c62b2720 Mon Sep 17 00:00:00 2001
From: William Song <jinwook@umich.edu>
Date: Thu, 18 Sep 2025 20:37:08 +0900
Subject: [PATCH 28/58] [Docs] add the parallel sampling usage in LLMEngine and
 AsyncLLM (#24222)

---
 vllm/sampling_params.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 0a01cb0260ae5..efe70d019ccc6 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -81,7 +81,13 @@ class SamplingParams(
     """
 
     n: int = 1
-    """Number of output sequences to return for the given prompt."""
+    """Number of outputs to return for the given prompt request.
+
+    NOTE:
+        `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs
+        are generated and streamed cumulatively per request. To see all `n`
+        outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
+        in `SamplingParams`."""
     best_of: Optional[int] = None
     """Number of output sequences that are generated from the prompt. From
     these `best_of` sequences, the top `n` sequences are returned. `best_of`

From 5a33ae9a3faae79cad9d2659862fcd8d86483659 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:41:41 +0100
Subject: [PATCH 29/58] Fix forward reference warning in documentation (#25150)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/engine/async_timeout.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
index 28a023a71ef52..3b9c055160c1b 100644
--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
@@ -16,19 +16,6 @@ if sys.version_info[:2] >= (3, 11):
     from asyncio import timeout as asyncio_timeout
 else:
 
-    def asyncio_timeout(delay: Optional[float]) -> "Timeout":
-        """timeout context manager.
-        Useful in cases when you want to apply timeout logic around block
-        of code or in cases when asyncio.wait_for is not suitable. For example:
-        >>> async with timeout(0.001):
-        ...     async with aiohttp.get('https://github.com') as r:
-        ...         await r.text()
-        delay - value in seconds or None to disable timeout logic
-        """
-        loop = asyncio.get_running_loop()
-        deadline = loop.time() + delay if delay is not None else None
-        return Timeout(deadline, loop)
-
     class _State(enum.Enum):
         INIT = "INIT"
         ENTER = "ENTER"
@@ -171,3 +158,16 @@ else:
             self._state = _State.TIMEOUT
             # drop the reference early
             self._timeout_handler = None
+
+    def asyncio_timeout(delay: Optional[float]) -> Timeout:
+        """timeout context manager.
+        Useful in cases when you want to apply timeout logic around block
+        of code or in cases when asyncio.wait_for is not suitable. For example:
+        >>> async with timeout(0.001):
+        ...     async with aiohttp.get('https://github.com') as r:
+        ...         await r.text()
+        delay - value in seconds or None to disable timeout logic
+        """
+        loop = asyncio.get_running_loop()
+        deadline = loop.time() + delay if delay is not None else None
+        return Timeout(deadline, loop)

From 3ed1ec4af25a9cb7dcfea74b839864fc3c8ba09d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 13:06:28 +0100
Subject: [PATCH 30/58] Fix `validate-config` pre-commit check (#25157)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .pre-commit-config.yaml  |  4 +---
 tools/validate_config.py | 23 ++++++++++++++++-------
 vllm/config/__init__.py  |  2 ++
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c16bdeeecd07a..13ad3af97d839 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -164,9 +164,7 @@ repos:
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/validate_config.py
     language: python
-    types: [python]
-    pass_filenames: true
-    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+    additional_dependencies: [regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
diff --git a/tools/validate_config.py b/tools/validate_config.py
index 8b1e955c653d7..f6439fa9ada5f 100644
--- a/tools/validate_config.py
+++ b/tools/validate_config.py
@@ -9,6 +9,8 @@ import ast
 import inspect
 import sys
 
+import regex as re
+
 
 def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]:
     """
@@ -88,11 +90,12 @@ def validate_class(class_node: ast.ClassDef):
     for stmt in class_node.body:
         # A field is defined as a class variable that has a type annotation.
         if isinstance(stmt, ast.AnnAssign):
-            # Skip ClassVar
+            # Skip ClassVar and InitVar
             # see https://docs.python.org/3/library/dataclasses.html#class-variables
-            if isinstance(stmt.annotation, ast.Subscript) and isinstance(
-                    stmt.annotation.value,
-                    ast.Name) and stmt.annotation.value.id == "ClassVar":
+            # and https://docs.python.org/3/library/dataclasses.html#init-only-variables
+            if (isinstance(stmt.annotation, ast.Subscript)
+                    and isinstance(stmt.annotation.value, ast.Name)
+                    and stmt.annotation.value.id in {"ClassVar", "InitVar"}):
                 continue
 
             if isinstance(stmt.target, ast.Name):
@@ -132,7 +135,7 @@ def validate_ast(tree: ast.stmt):
 
 def validate_file(file_path: str):
     try:
-        print(f"validating {file_path} config dataclasses ", end="")
+        print(f"Validating {file_path} config dataclasses ", end="")
         with open(file_path, encoding="utf-8") as f:
             source = f.read()
 
@@ -140,7 +143,7 @@ def validate_file(file_path: str):
         validate_ast(tree)
     except ValueError as e:
         print(e)
-        SystemExit(2)
+        raise SystemExit(1) from e
     else:
         print("✅")
 
@@ -151,7 +154,13 @@ def fail(message: str, node: ast.stmt):
 
 def main():
     for filename in sys.argv[1:]:
-        validate_file(filename)
+        # Only run for Python files in vllm/ or tests/
+        if not re.match(r"^(vllm|tests)/.*\.py$", filename):
+            continue
+        # Only run if the file contains @config
+        with open(filename, encoding="utf-8") as f:
+            if "@config" in f.read():
+                validate_file(filename)
 
 
 if __name__ == "__main__":
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 69ab5712d404c..25daca00c02d9 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -450,6 +450,8 @@ class ModelConfig:
 
     # Multimodal config and init vars
     multimodal_config: Optional[MultiModalConfig] = None
+    """Configuration for multimodal model. If `None`, this will be inferred
+    from the architecture of `self.model`."""
     limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
     media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
     mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None

From 66072b36dbf1707440ff43d57273d9e9974349d7 Mon Sep 17 00:00:00 2001
From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Date: Thu, 18 Sep 2025 15:21:17 +0300
Subject: [PATCH 31/58] [Bugfix][Mamba] - Fix Conv State Kernel FP32 Support
 (#24883)

Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
---
 tests/models/language/generation/test_hybrid.py       |  9 ++++++---
 vllm/model_executor/layers/mamba/ops/causal_conv1d.py | 10 ++++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index d0e42062099ec..206ad1352e06e 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -418,7 +418,9 @@ def test_full_cuda_graph(
 @pytest.mark.parametrize("model", FP32_STATE_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
-def test_fp32_state(
+@pytest.mark.parametrize("cache_dtype_param",
+                         ["mamba_ssm_cache_dtype", "mamba_cache_dtype"])
+def test_fp32_cache_state(
     hf_runner,
     vllm_runner,
     example_prompts,
@@ -426,6 +428,7 @@ def test_fp32_state(
     model: str,
     max_tokens: int,
     num_logprobs: int,
+    cache_dtype_param: str,
 ) -> None:
 
     try:
@@ -443,13 +446,13 @@ def test_fp32_state(
         m.setenv("VLLM_USE_V1", "0")
         with vllm_runner(model,
                          max_num_seqs=MAX_NUM_SEQS,
-                         mamba_ssm_cache_dtype="float32") as vllm_model:
+                         **{cache_dtype_param: "float32"}) as vllm_model:
             vllm_v0_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model,
                      max_num_seqs=MAX_NUM_SEQS,
-                     mamba_ssm_cache_dtype="float32") as vllm_model:
+                     **{cache_dtype_param: "float32"}) as vllm_model:
         vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs)
 
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index 2a88fa661da01..8cfd0962c5bfe 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -415,6 +415,9 @@ def causal_conv1d_fn(
         activation = "silu"
 
     args = None
+    # Store original dtype to cast back at the end
+    original_x_dtype = x.dtype
+    x = x.to(conv_states.dtype)
     out = torch.empty_like(x)
     if metadata is not None:
         cu_seqlen = metadata.cu_seqlen
@@ -613,7 +616,7 @@ def causal_conv1d_fn(
         BLOCK_N=256,
         num_stages=2,
     )
-    return out
+    return out.to(original_x_dtype)
 
 
 @triton.jit()
@@ -973,6 +976,9 @@ def causal_conv1d_update(
         activation = "silu" if activation is True else None
     elif activation is not None:
         assert activation in ["silu", "swish"]
+
+    original_x_dtype = x.dtype
+    x = x.to(conv_state.dtype)
     unsqueeze = query_start_loc is None and x.dim() == 2
     if unsqueeze:
         # make it (batch, dim, seqlen) with seqlen == 1
@@ -1081,4 +1087,4 @@ def causal_conv1d_update(
     )
     if unsqueeze:
         out = out.squeeze(-1)
-    return out
+    return out.to(original_x_dtype)

From 21da73343ad35f756e053ba4155dafb05229b0c5 Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Thu, 18 Sep 2025 05:43:33 -0700
Subject: [PATCH 32/58] [Misc] Clean up flags in `vllm bench serve` (#25138)

Signed-off-by: Roger Wang <hey@rogerw.io>
---
 docs/contributing/benchmarks.md    |  3 --
 tests/benchmarks/test_serve_cli.py |  2 +-
 vllm/benchmarks/datasets.py        |  8 ++---
 vllm/benchmarks/serve.py           | 49 +++++++++++++++++++++---------
 4 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md
index d04b1d1136a1c..2a03ce1dffd63 100644
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
@@ -245,7 +243,6 @@ vllm bench serve \
 ```bash
 vllm bench serve \
   --backend openai-chat \
-  --endpoint-type openai-chat \
   --model Qwen/Qwen2-VL-7B-Instruct \
   --endpoint /v1/chat/completions \
   --dataset-name hf \
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 5471d6b8e4a5f..fafbef5f37180 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
         "5",
         "--endpoint",
         "/v1/chat/completions",
-        "--endpoint-type",
+        "--backend",
         "openai-chat",
     ]
     result = subprocess.run(command, capture_output=True, text=True)
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 1cab40802c392..68a937d5750ec 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1358,7 +1358,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
     elif args.dataset_name == "sonnet":
         dataset = SonnetDataset(dataset_path=args.dataset_path)
         # For the "sonnet" dataset, formatting depends on the backend.
-        if args.endpoint_type == "openai-chat":
+        if args.backend == "openai-chat":
             input_requests = dataset.sample(
                 num_requests=args.num_prompts,
                 input_len=args.sonnet_input_len,
@@ -1462,7 +1462,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 "Please consider contributing if you would "
                 "like to add support for additional dataset formats.")
 
-        if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [
+        if dataset_class.IS_MULTIMODAL and args.backend not in [
                 "openai-chat",
                 "openai-audio",
         ]:
@@ -1470,7 +1470,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             # endpoint-type.
             raise ValueError(
                 "Multi-modal content is only supported on 'openai-chat' and "
-                "'openai-audio' endpoint-type.")
+                "'openai-audio' backends.")
         input_requests = dataset_class(
             dataset_path=args.dataset_path,
             dataset_subset=args.hf_subset,
@@ -1563,7 +1563,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
 
         try:
             # Enforce endpoint compatibility for multimodal datasets.
-            if args.dataset_name == "random-mm" and args.endpoint_type not in [
+            if args.dataset_name == "random-mm" and args.backend not in [
                     "openai-chat"]:
                 raise ValueError(
                     "Multi-modal content (images) is only supported on "
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index d8784340eba15..7382782f11655 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -8,8 +8,8 @@ to launch the vLLM OpenAI API server:
 
 On the client side, run:
     vllm bench serve \
-        --endpoint-type <endpoint_type. Default 'openai'> \
-        --label <benchmark result label. Default using endpoint_type> \
+        --backend <backend or endpoint type. Default 'openai'> \
+        --label <benchmark result label. Default using backend> \
         --model <your_model> \
         --dataset-name <dataset_name. Default 'random'> \
         --request-rate <request_rate. Default inf> \
@@ -52,6 +52,21 @@ TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
                           and (shutil.which("gnuplot") is not None))
 
 
+# TODO: Remove this in v0.11.0
+class DeprecatedEndpointTypeAction(argparse.Action):
+    """Warn about --endpoint-type and forward its value to --backend."""
+
+    def __call__(self, _, namespace, values, option_string=None):
+        warnings.warn(
+            "'--endpoint-type' is deprecated and will be removed in v0.11.0. "
+            "Please use '--backend' instead or remove this argument if you "
+            "have already set it.",
+            stacklevel=1,
+        )
+        setattr(namespace, self.dest, values)
+        namespace.backend = values  # benchmark code only reads args.backend
+
+
 class TaskType(Enum):
     GENERATION = "generation"
     EMBEDDING = "embedding"
@@ -470,7 +485,7 @@ async def benchmark(
         else:
             request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
     else:
-        raise ValueError(f"Unknown endpoint_type: {endpoint_type}")
+        raise ValueError(f"Unknown backend: {endpoint_type}")
 
     # Reuses connections across requests to reduce TLS handshake overhead.
     connector = aiohttp.TCPConnector(
@@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
 
 def add_cli_args(parser: argparse.ArgumentParser):
     add_dataset_parser(parser)
-    parser.add_argument(
-        "--endpoint-type",
-        type=str,
-        default="openai",
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-    )
     parser.add_argument(
         "--label",
         type=str,
         default=None,
         help="The label (prefix) of the benchmark results. If not specified, "
-        "the endpoint type will be used as the label.",
+        "the value of '--backend' will be used as the label.",
     )
     parser.add_argument(
         "--backend",
         type=str,
-        default="vllm",
+        default="openai",
         choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        help="The type of backend or endpoint to use for the benchmark."
+    )
+    parser.add_argument(
+        "--endpoint-type",
+        type=str,
+        default=None,
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        action=DeprecatedEndpointTypeAction,
+        help="'--endpoint-type' is deprecated and will be removed in v0.11.0. "
+        "Please use '--backend' instead.",
     )
     parser.add_argument(
         "--base-url",
@@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
             raise ValueError(
                 "For exponential ramp-up, the start RPS cannot be 0.")
 
-    endpoint_type = args.endpoint_type
     label = args.label
     model_id = args.model
     model_name = args.served_model_name
@@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     gc.freeze()
 
     benchmark_result = await benchmark(
-        endpoint_type=args.endpoint_type,
+        endpoint_type=args.backend,
         api_url=api_url,
         base_url=base_url,
         model_id=model_id,
@@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     # Setup
     current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
     result_json["date"] = current_dt
-    result_json["endpoint_type"] = args.endpoint_type
+    result_json["endpoint_type"] = args.backend # for backward compatibility
+    result_json["backend"] = args.backend
     result_json["label"] = label
     result_json["model_id"] = model_id
     result_json["tokenizer_id"] = tokenizer_id
@@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         base_model_id = model_id.split("/")[-1]
         max_concurrency_str = (f"-concurrency{args.max_concurrency}"
                                if args.max_concurrency is not None else "")
-        label = label or endpoint_type
+        label = label or args.backend
         if args.ramp_up_strategy is not None:
             file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
         else:

From 470484a4f503d4768008c2f5a8dc828dc90633b4 Mon Sep 17 00:00:00 2001
From: Shanshan Shen <467638484@qq.com>
Date: Thu, 18 Sep 2025 20:44:31 +0800
Subject: [PATCH 33/58] [Structured Output][Refactor] Move
 `apply_grammar_bitmask()` method from `ModelRunner` to structured output
 utils (#21999)

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 vllm/v1/structured_output/utils.py | 80 ++++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py | 75 ++--------------------------
 2 files changed, 84 insertions(+), 71 deletions(-)

diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 953185a8fc31d..127c8876525b5 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -8,7 +8,9 @@ import importlib.metadata
 import os
 from typing import TYPE_CHECKING
 
+import numpy as np
 import regex as re
+import torch
 from cachetools import LRUCache
 from diskcache import Cache
 
@@ -20,9 +22,13 @@ if TYPE_CHECKING:
     import outlines_core as oc
     import transformers.file_utils as file_utils
     import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2
+    import xgrammar as xgr
 
     from vllm.transformers_utils.tokenizer import AnyTokenizer
+    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.worker.gpu_input_batch import InputBatch
 else:
+    xgr = LazyLoader("xgr", globals(), "xgrammar")
     oc = LazyLoader("oc", globals(), "outlines_core")
     file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils")
     tokenization_gpt2 = LazyLoader(
@@ -36,6 +42,80 @@ logger = init_logger(__name__)
 CACHE = None
 
 
+def apply_grammar_bitmask(
+    scheduler_output: SchedulerOutput,
+    input_batch: InputBatch,
+    logits: torch.Tensor,
+    device: torch.device,
+) -> None:
+    """
+    Apply the scheduler's grammar bitmask to `logits` in place via xgrammar.
+
+    Args:
+        scheduler_output: Scheduling result; no-op if its bitmask is None.
+        input_batch: Runner batch; `req_id_to_index` gives the row order.
+        logits: Output logits of the model forward pass; modified in place.
+        device: Device the bitmask is moved to before it is applied.
+    """
+    grammar_bitmask = scheduler_output.grammar_bitmask
+    if grammar_bitmask is None:
+        return
+
+    # We receive the structured output bitmask from the scheduler,
+    # compacted to contain bitmasks only for structured output requests.
+    # The order of the requests in the bitmask is not guaranteed to be the
+    # same as the order of the requests in the model runner's batch. We need
+    # to sort the bitmask to match the order of the requests used here.
+
+    # Get the batch indices of the structured output requests.
+    # Keep track of the number of speculative tokens scheduled for every
+    # request in the batch, as the logit indices are offset by this amount.
+    struct_out_req_batch_indices: dict[str, int] = {}
+    cumulative_offset = 0
+    seq = sorted(input_batch.req_id_to_index.items(), key=lambda x: x[1])
+    for req_id, batch_index in seq:
+        logit_index = batch_index + cumulative_offset
+        cumulative_offset += len(
+            scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
+        if req_id in scheduler_output.structured_output_request_ids:
+            struct_out_req_batch_indices[req_id] = logit_index
+
+    out_indices = []
+
+    # Reorder the bitmask to match the order of the requests in the batch.
+    sorted_bitmask = np.full(shape=(logits.shape[0], grammar_bitmask.shape[1]),
+                             fill_value=-1,
+                             dtype=grammar_bitmask.dtype)
+    cumulative_index = 0
+    seq = sorted(scheduler_output.structured_output_request_ids.items(),
+                 key=lambda x: x[1])
+    for req_id, _ in seq:
+        logit_index = struct_out_req_batch_indices[req_id]
+        num_spec_tokens = len(
+            scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
+        for i in range(1 + num_spec_tokens):
+            sorted_bitmask[logit_index + i] = \
+                grammar_bitmask[cumulative_index + i]
+            out_indices.append(logit_index + i)
+        cumulative_index += 1 + num_spec_tokens
+    grammar_bitmask = sorted_bitmask
+
+    # If there are as many out indices as rows in the logits,
+    # we don't need to pass indices to the kernel,
+    # since the bitmask is already aligned with the logits.
+    skip_out_indices = len(out_indices) == logits.shape[0]
+
+    # Serialization of np.ndarray is much more efficient than a tensor,
+    # so we receive it in that format.
+    grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
+
+    xgr.apply_token_bitmask_inplace(
+        logits,
+        grammar_bitmask.to(device, non_blocking=True),
+        indices=out_indices if not skip_out_indices else None,
+    )
+
+
 class OutlinesVocabulary:
     """
     Wrapper class for `outlines_core.Vocabulary`,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2e67984cb4327..4873b586724ec 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -54,7 +54,7 @@ from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors, PoolerOutput
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size,
+                        GiB_bytes, check_use_alibi, get_dtype_size,
                         is_pin_memory_available, round_up, supports_dynamo)
 from vllm.v1.attention.backends.flash_attn import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
@@ -85,6 +85,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.spec_decode.medusa import MedusaProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper
@@ -101,12 +102,8 @@ from .utils import (AttentionGroup, MultiModalBudget,
                     scatter_mm_placeholders)
 
 if TYPE_CHECKING:
-    import xgrammar as xgr
-
     from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
     from vllm.v1.core.sched.output import SchedulerOutput
-else:
-    xgr = LazyLoader("xgr", globals(), "xgrammar")
 
 logger = init_logger(__name__)
 
@@ -1617,71 +1614,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return tuple(tasks)
 
-    def apply_grammar_bitmask(
-        self,
-        scheduler_output: "SchedulerOutput",
-        logits: torch.Tensor,
-    ):
-        grammar_bitmask = scheduler_output.grammar_bitmask
-        if grammar_bitmask is None:
-            return
-
-        # We receive the structured output bitmask from the scheduler,
-        # compacted to contain bitmasks only for structured output requests.
-        # The order of the requests in the bitmask is not guaranteed to be the
-        # same as the order of the requests in the gpu runner's batch. We need
-        # to sort the bitmask to match the order of the requests used here.
-
-        # Get the batch indices of the structured output requests.
-        # Keep track of the number of speculative tokens scheduled for every
-        # request in the batch, as the logit indices are offset by this amount.
-        struct_out_req_batch_indices: dict[str, int] = {}
-        cumulative_offset = 0
-        seq = sorted(self.input_batch.req_id_to_index.items(),
-                     key=lambda x: x[1])
-        for req_id, batch_index in seq:
-            logit_index = batch_index + cumulative_offset
-            cumulative_offset += len(
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
-            if req_id in scheduler_output.structured_output_request_ids:
-                struct_out_req_batch_indices[req_id] = logit_index
-
-        out_indices = []
-
-        # Reorder the bitmask to match the order of the requests in the batch.
-        sorted_bitmask = np.full(shape=(logits.shape[0],
-                                        grammar_bitmask.shape[1]),
-                                 fill_value=-1,
-                                 dtype=grammar_bitmask.dtype)
-        cumulative_index = 0
-        seq = sorted(scheduler_output.structured_output_request_ids.items(),
-                     key=lambda x: x[1])
-        for req_id, _ in seq:
-            logit_index = struct_out_req_batch_indices[req_id]
-            num_spec_tokens = len(
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
-            for i in range(1 + num_spec_tokens):
-                sorted_bitmask[logit_index + i] = \
-                    grammar_bitmask[cumulative_index + i]
-                out_indices.append(logit_index + i)
-            cumulative_index += 1 + num_spec_tokens
-        grammar_bitmask = sorted_bitmask
-
-        # If the length of out indices and the logits have the same shape
-        # we don't need to pass indices to the kernel,
-        # since the bitmask is already aligned with the logits.
-        skip_out_indices = len(out_indices) == logits.shape[0]
-
-        # Serialization of np.ndarray is much more efficient than a tensor,
-        # so we receive it in that format.
-        grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
-
-        xgr.apply_token_bitmask_inplace(
-            logits,
-            grammar_bitmask.to(self.device, non_blocking=True),
-            indices=out_indices if not skip_out_indices else None,
-        )
-
     def sync_and_slice_intermediate_tensors(
             self, num_tokens: int, intermediate_tensors: IntermediateTensors,
             sync_self: bool) -> IntermediateTensors:
@@ -2232,7 +2164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
             # Apply structured output bitmasks if present
             if scheduler_output.grammar_bitmask is not None:
-                self.apply_grammar_bitmask(scheduler_output, logits)
+                apply_grammar_bitmask(scheduler_output, self.input_batch,
+                                      logits, self.device)
 
         with record_function_or_nullcontext("Sample"):
             sampler_output = self._sample(logits, spec_decode_metadata)

From fbd6523ac00082c398dc8126434cede595169609 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Thu, 18 Sep 2025 08:53:45 -0400
Subject: [PATCH 34/58] Refactor dense FP8 tensor/channel/block utils and add
 CT FP8 block (#21404)

---
 vllm/model_executor/layers/linear.py          |  14 +-
 .../compressed_tensors/compressed_tensors.py  |  68 ++---
 .../schemes/compressed_tensors_w8a8_fp8.py    | 191 ++++++-------
 .../model_executor/layers/quantization/fp8.py | 265 ++++++------------
 .../layers/quantization/utils/fp8_utils.py    | 220 +++++++++++++++
 5 files changed, 441 insertions(+), 317 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index cd05136520977..5bf96398bc710 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -805,12 +805,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         assert loaded_shard_id < len(self.output_sizes)
 
         if isinstance(param, BlockQuantScaleParameter):
-            from vllm.model_executor.layers.quantization.fp8 import (
-                Fp8LinearMethod, Fp8MoEMethod)
             assert self.quant_method is not None
-            assert isinstance(self.quant_method,
-                              (Fp8LinearMethod, Fp8MoEMethod))
-            weight_block_size = self.quant_method.quant_config.weight_block_size
+            # Assume the weight block size has been set by quant method
+            assert hasattr(self, "weight_block_size")
+            weight_block_size = self.weight_block_size
             assert weight_block_size is not None
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (
@@ -989,8 +987,10 @@ class QKVParallelLinear(ColumnParallelLinear):
         # Note(simon): This is needed for Qwen3's fp8 quantization.
         if isinstance(param, BlockQuantScaleParameter):
             assert self.quant_method is not None
-            assert hasattr(self.quant_method, "quant_config")
-            weight_block_size = self.quant_method.quant_config.weight_block_size
+            # Assume the weight block size has been set by quant method
+            assert hasattr(self, "weight_block_size")
+            weight_block_size = self.weight_block_size
+            assert weight_block_size is not None
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (shard_offset + block_n - 1) // block_n
             shard_size = (shard_size + block_n - 1) // block_n
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index b56a691311774..d6550dd16892f 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -12,7 +12,6 @@ from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy,
                                              QuantizationType)
 from compressed_tensors.transform import TransformConfig
-from pydantic import BaseModel
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -268,7 +267,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         else:
             return False
 
-    def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):
+    def _is_fp4a4_nvfp4(self, weight_quant: QuantizationArgs,
+                        input_quant: QuantizationArgs):
 
         if weight_quant is None or input_quant is None:
             return False
@@ -288,8 +288,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         return (is_tensor_group_quant and is_float_type and is_4_bits
                 and is_group_size_16 and is_symmetric)
 
-    def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
-                         input_quant: BaseModel):
+    def _is_fp4a16_nvfp4(self, weight_quant: QuantizationArgs,
+                         input_quant: QuantizationArgs):
 
         is_weight_only = weight_quant is not None and input_quant is None
         is_tensor_group_quant = (
@@ -303,8 +303,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)
 
-    def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
-                               input_quant: BaseModel) -> bool:
+    def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
@@ -317,8 +317,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         # Only symmetric weight quantization supported.
         return is_8_bits and is_tensor and weight_quant.symmetric and is_static
 
-    def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
-                               input_quant: BaseModel) -> bool:
+    def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
         weight_strategy = (
             weight_quant.strategy == QuantizationStrategy.TENSOR.value
@@ -331,8 +331,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
 
-    def _is_dynamic_token_w4a8_int(self, weight_quant: BaseModel,
-                                   input_quant: BaseModel) -> bool:
+    def _is_dynamic_token_w4a8_int(self, weight_quant: QuantizationArgs,
+                                   input_quant: QuantizationArgs) -> bool:
         is_weight_4_bits = weight_quant.num_bits == 4
         is_activation_8_bits = input_quant.num_bits == 8
         weight_strategy = (
@@ -347,8 +347,8 @@ class CompressedTensorsConfig(QuantizationConfig):
         return (is_weight_4_bits and is_activation_8_bits and is_token
                 and weight_quant.symmetric and is_dynamic)
 
-    def _is_fp8_w8a8(self, weight_quant: BaseModel,
-                     input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a8(self, weight_quant: QuantizationArgs,
+                     input_quant: QuantizationArgs) -> bool:
         # Confirm weights and activations quantized.
         if weight_quant is None or input_quant is None:
             return False
@@ -358,11 +358,12 @@ class CompressedTensorsConfig(QuantizationConfig):
                              and input_quant.type == QuantizationType.FLOAT)
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
-            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
+        is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [
+            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK
         ])
         if not (is_floating_point and is_symmetric_weight and is_static_weight
-                and is_per_tensor_or_channel_weight):
+                and is_tensor_or_channel_or_block_weight):
             return False
 
         # Dynamic quantization is always supported if weights supported.
@@ -375,8 +376,8 @@ class CompressedTensorsConfig(QuantizationConfig):
             input_quant.strategy == QuantizationStrategy.TENSOR)
         return is_symmetric_activation and is_per_tensor_activation
 
-    def _is_fp8_w4a8(self, weight_quant: BaseModel,
-                     input_quant: BaseModel) -> bool:
+    def _is_fp8_w4a8(self, weight_quant: QuantizationArgs,
+                     input_quant: QuantizationArgs) -> bool:
         if not weight_quant or not input_quant:
             return False
         is_weight_4_bits = weight_quant.num_bits == 4
@@ -392,24 +393,24 @@ class CompressedTensorsConfig(QuantizationConfig):
         return (is_weight_4_bits and is_activation_8_bits and is_token
                 and is_symmetric and is_dynamic)
 
-    def _is_fp8_w4a8_sm90(self, weight_quant: BaseModel,
-                          input_quant: BaseModel) -> bool:
+    def _is_fp8_w4a8_sm90(self, weight_quant: QuantizationArgs,
+                          input_quant: QuantizationArgs) -> bool:
         return (self._check_scheme_supported(90, error=False, match_exact=True)
                 and self._is_fp8_w4a8(weight_quant, input_quant))
 
-    def _is_fp8_w8a8_sm90(self, weight_quant: BaseModel,
-                          input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a8_sm90(self, weight_quant: QuantizationArgs,
+                          input_quant: QuantizationArgs) -> bool:
         return (self._check_scheme_supported(90, error=False, match_exact=True)
                 and self._is_fp8_w8a8(weight_quant, input_quant))
 
-    def _is_fp8_w8a8_sm100(self, weight_quant: BaseModel,
-                           input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a8_sm100(self, weight_quant: QuantizationArgs,
+                           input_quant: QuantizationArgs) -> bool:
         return (self._check_scheme_supported(
             100, error=False, match_exact=True)
                 and self._is_fp8_w8a8(weight_quant, input_quant))
 
-    def _is_fp8_w8a16(self, weight_quant: BaseModel,
-                      input_quant: BaseModel) -> bool:
+    def _is_fp8_w8a16(self, weight_quant: QuantizationArgs,
+                      input_quant: QuantizationArgs) -> bool:
         # Confirm weights quantized.
         if weight_quant is None:
             return False
@@ -421,18 +422,19 @@ class CompressedTensorsConfig(QuantizationConfig):
         # Confirm weight scheme is supported.
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
-            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
+        is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [
+            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK
         ])
         if not (is_symmetric_weight and is_static_weight  # noqa: SIM103
-                and is_per_tensor_or_channel_weight):
+                and is_tensor_or_channel_or_block_weight):
             return False
 
         # All conditions satisfied.
         return True
 
-    def _is_wNa16_group_channel(self, weight_quant: BaseModel,
-                                input_quant: BaseModel) -> bool:
+    def _is_wNa16_group_channel(self, weight_quant: QuantizationArgs,
+                                input_quant: QuantizationArgs) -> bool:
         input_quant_none = input_quant is None
         is_channel_group = (
             weight_quant.strategy == QuantizationStrategy.CHANNEL.value
@@ -443,8 +445,8 @@ class CompressedTensorsConfig(QuantizationConfig):
 
     def _get_scheme_from_parts(
             self,
-            weight_quant: BaseModel,
-            input_quant: BaseModel,
+            weight_quant: QuantizationArgs,
+            input_quant: QuantizationArgs,
             format: Optional[str] = None) -> "CompressedTensorsScheme":
 
         # use the per-layer format if defined, otherwise, use global format
@@ -496,7 +498,7 @@ class CompressedTensorsConfig(QuantizationConfig):
                     CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
                 if is_fp8_w8a8_supported:
                     return CompressedTensorsW8A8Fp8(
-                        strategy=weight_quant.strategy,
+                        weight_quant=weight_quant,
                         is_static_input_scheme=(input_quant
                                                 and not input_quant.dynamic))
                 else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index d984e89d9e02a..d42ae22c51393 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -4,28 +4,41 @@
 from typing import Callable, Optional
 
 import torch
-from compressed_tensors.quantization import QuantizationStrategy
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy)
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    apply_fp8_block_linear, check_aiter_fp8_linear_support,
+    create_fp8_input_scale, create_fp8_scale_parameter,
+    create_fp8_weight_parameter, maybe_post_process_fp8_weight_block,
+    process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy,
+    process_fp8_weight_tensor_strategy, validate_fp8_block_shape)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    Fp8LinearOp, maybe_create_device_identity, normalize_e4m3fn_to_e4m3fnuz,
-    requantize_with_max_scale)
-from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
-                                           ModelWeightParameter,
+    Fp8LinearOp, cutlass_block_fp8_supported, maybe_create_device_identity)
+from vllm.model_executor.parameter import (BlockQuantScaleParameter,
+                                           ChannelQuantScaleParameter,
                                            PerTensorScaleParameter)
-from vllm.platforms import current_platform
 
 __all__ = ["CompressedTensorsW8A8Fp8"]
 
+strategy_to_parameter_type = {  # weight_scale parameter class per strategy
+    QuantizationStrategy.BLOCK: BlockQuantScaleParameter,
+    QuantizationStrategy.CHANNEL: ChannelQuantScaleParameter,
+    QuantizationStrategy.TENSOR: PerTensorScaleParameter,
+}
+
 
 class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
 
-    def __init__(self, strategy: str, is_static_input_scheme: bool):
-        self.strategy = strategy
+    def __init__(self, weight_quant: QuantizationArgs,
+                 is_static_input_scheme: bool):
+        self.weight_quant = weight_quant
+        self.strategy = weight_quant.strategy
         self.out_dtype = torch.get_default_dtype()
         self.is_static_input_scheme = is_static_input_scheme
         self.act_q_group_shape = GroupShape.PER_TENSOR \
@@ -34,61 +47,84 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
             act_quant_static=self.is_static_input_scheme,
             act_quant_group_shape=self.act_q_group_shape)
 
+        self.weight_block_size = self.weight_quant.block_structure
+        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported()
+        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+
     @classmethod
     def get_min_capability(cls) -> int:
         # lovelace and up
         return 89
 
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: list[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       weight_loader: Callable, **kwargs):
+        maybe_create_device_identity()
+
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.weight_block_size = None
+
+        if self.strategy == QuantizationStrategy.BLOCK:
+            assert self.weight_block_size is not None
+            layer.weight_block_size = self.weight_block_size
+            # Validate block quantization shapes
+            validate_fp8_block_shape(layer, input_size, output_size,
+                                     input_size_per_partition,
+                                     output_partition_sizes,
+                                     self.weight_block_size)
+
+        # WEIGHT
+        weight = create_fp8_weight_parameter(output_size_per_partition,
+                                             input_size_per_partition,
+                                             weight_loader)
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = create_fp8_scale_parameter(
+            strategy_to_parameter_type[self.strategy], output_partition_sizes,
+            input_size_per_partition, layer.weight_block_size, weight_loader)
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            input_scale = create_fp8_input_scale(output_partition_sizes,
+                                                 weight_loader)
+            layer.register_parameter("input_scale", input_scale)
+
     def process_weights_after_loading(self, layer) -> None:
-        # If per tensor, when we have a fused module (e.g. QKV) with per
-        # tensor scales (thus N scales being passed to the kernel),
-        # requantize so we can always run per tensor
         if self.strategy == QuantizationStrategy.TENSOR:
-            max_w_scale, weight = requantize_with_max_scale(
-                weight=layer.weight,
-                weight_scale=layer.weight_scale,
-                logical_widths=layer.logical_widths,
-            )
+            weight, weight_scale, input_scale = (
+                process_fp8_weight_tensor_strategy(
+                    layer.weight, layer.weight_scale, layer.logical_widths,
+                    getattr(layer, 'input_scale', None)))
+            weight = weight.t()
 
-            if current_platform.is_fp8_fnuz():
-                input_scale = getattr(layer, 'input_scale', None)
-
-                weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
-                    weight=weight,
-                    weight_scale=max_w_scale,
-                    input_scale=input_scale)
-                if input_scale is not None:
-                    layer.input_scale = Parameter(input_scale,
-                                                  requires_grad=False)
-
-            layer.weight = Parameter(weight.t(), requires_grad=False)
-            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
-
-        # If channelwise, scales are already lined up, so just transpose.
         elif self.strategy == QuantizationStrategy.CHANNEL:
-            weight = layer.weight
+            weight, weight_scale, input_scale = (
+                process_fp8_weight_channel_strategy(
+                    layer.weight, layer.weight_scale,
+                    getattr(layer, 'input_scale', None)))
+            weight = weight.t()
 
-            if current_platform.is_fp8_fnuz():
-                input_scale = getattr(layer, 'input_scale', None)
-
-                weight, weight_scale, input_scale = \
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        weight=weight,
-                        weight_scale=layer.weight_scale,
-                        input_scale=input_scale)
-                if input_scale is not None:
-                    layer.input_scale = Parameter(input_scale,
-                                                  requires_grad=False)
-            else:
-                weight_scale = layer.weight_scale.data
-
-            layer.weight = Parameter(weight.t(), requires_grad=False)
-            # required by torch.compile to be torch.nn.Parameter
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+        elif self.strategy == QuantizationStrategy.BLOCK:
+            assert self.is_static_input_scheme is False
+            weight, weight_scale = process_fp8_weight_block_strategy(
+                layer.weight, layer.weight_scale)
+            input_scale = None
 
         else:
             raise ValueError(f"Unknown quantization strategy {self.strategy}")
 
+        # required by torch.compile to be torch.nn.Parameter
+        layer.weight = Parameter(weight.data, requires_grad=False)
+        layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
+        if input_scale is not None:
+            layer.input_scale = Parameter(input_scale.data,
+                                          requires_grad=False)
+
         # INPUT SCALE
         if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
             layer.input_scale = Parameter(layer.input_scale.max(),
@@ -96,58 +132,23 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
         else:
             layer.input_scale = None
 
-    def create_weights(self, layer: torch.nn.Module,
-                       output_partition_sizes: list[int],
-                       input_size_per_partition: int,
-                       params_dtype: torch.dtype, weight_loader: Callable,
-                       **kwargs):
-        maybe_create_device_identity()
-
-        output_size_per_partition = sum(output_partition_sizes)
-        layer.logical_widths = output_partition_sizes
-
-        # WEIGHT
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=torch.float8_e4m3fn),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
-        layer.register_parameter("weight", weight)
-
-        # WEIGHT SCALE
-        # TODO: update create_xxx_parameter functions to return
-        # the newly added parameters
-        if self.strategy == QuantizationStrategy.CHANNEL:
-            weight_scale = ChannelQuantScaleParameter(
-                data=torch.empty((sum(output_partition_sizes), 1),
-                                 dtype=torch.float32),
-                output_dim=0,
-                weight_loader=weight_loader)
-        else:
-            assert self.strategy == QuantizationStrategy.TENSOR
-            weight_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                   weight_loader=weight_loader)
-
-        # min requirement for fp8 kernels
-        weight_scale[:] = torch.finfo(torch.float32).min
-        layer.register_parameter("weight_scale", weight_scale)
-
-        # INPUT SCALE
-        if self.is_static_input_scheme:
-            input_scale = PerTensorScaleParameter(data=torch.empty(
-                len(output_partition_sizes), dtype=torch.float32),
-                                                  weight_loader=weight_loader)
-            input_scale[:] = torch.finfo(torch.float32).min
-            layer.register_parameter("input_scale", input_scale)
+        if self.strategy == QuantizationStrategy.BLOCK:
+            maybe_post_process_fp8_weight_block(
+                layer, self.cutlass_block_fp8_supported)
 
     def apply_weights(self,
                       layer: torch.nn.Module,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
+        if layer.weight_block_size is not None:
+            return apply_fp8_block_linear(
+                layer,
+                input=x,
+                bias=bias,
+                cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
+                use_aiter_and_is_supported=self.use_aiter_and_is_supported)
+
         return self.fp8_linear.apply(input=x,
                                      weight=layer.weight,
                                      weight_scale=layer.weight_scale,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index e75094c54743c..aec9c79f1ea82 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -4,7 +4,6 @@
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 import torch
-import torch.nn.functional as F
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 
@@ -32,8 +31,12 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
     register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights,
     select_cutlass_fp8_gemm_impl, swap_w13_to_w31)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace,
-    should_use_deepgemm_for_fp8_linear)
+    apply_fp8_block_linear, check_aiter_fp8_linear_support,
+    create_fp8_input_scale, create_fp8_scale_parameter,
+    create_fp8_weight_parameter, get_col_major_tma_aligned_tensor,
+    maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy,
+    process_fp8_weight_tensor_strategy, requant_weight_ue8m0_inplace,
+    validate_fp8_block_shape)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin,
     prepare_moe_fp8_layer_for_marlin)
@@ -42,8 +45,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     Fp8LinearOp, all_close_1d, cutlass_block_fp8_supported,
     cutlass_fp8_supported, maybe_create_device_identity,
-    normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
-    requantize_with_max_scale)
+    normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.parameter import (BlockQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
@@ -233,14 +235,10 @@ class Fp8LinearMethod(LinearMethodBase):
         if current_platform.is_rocm():
             self.use_marlin = False
 
-        # AITER is only supported on ROCm and only for FP8_FNUZ
-        # and at the moment are MI300 series
-        self.use_aiter_and_is_supported = (current_platform.is_rocm()
-                                           and envs.VLLM_ROCM_USE_AITER
-                                           and envs.VLLM_ROCM_USE_AITER_LINEAR
-                                           and current_platform.is_fp8_fnuz())
+        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
 
-        self.block_quant = self.quant_config.weight_block_size is not None
+        self.weight_block_size = self.quant_config.weight_block_size
+        self.block_quant = self.weight_block_size is not None
         self.act_q_static = self.quant_config.activation_scheme == "static"
         # Use per-token quantization for better perf if dynamic and cutlass
         if not self.act_q_static and cutlass_fp8_supported():
@@ -273,51 +271,27 @@ class Fp8LinearMethod(LinearMethodBase):
         layer.weight_block_size = None
 
         if self.block_quant:
-            tp_size = getattr(layer, "tp_size",
-                              get_tensor_model_parallel_world_size())
-            assert self.quant_config.weight_block_size is not None
-            layer.weight_block_size = self.quant_config.weight_block_size
-            block_n, block_k = (
-                self.quant_config.weight_block_size[0],
-                self.quant_config.weight_block_size[1],
-            )
-            # Required by row parallel
-            if (tp_size > 1
-                    and input_size // input_size_per_partition == tp_size
-                    and input_size_per_partition % block_k != 0):
-                raise ValueError(
-                    f"Weight input_size_per_partition = "
-                    f"{input_size_per_partition} is not divisible by "
-                    f"weight quantization block_k = {block_k}.")
-            # Required by column parallel or enabling merged weights
-            is_tp_split = (tp_size > 1 and
-                           output_size // output_size_per_partition == tp_size)
-            is_merged_gemm = len(output_partition_sizes) > 1
-            if is_tp_split or is_merged_gemm:
-                sizes_to_check = output_partition_sizes
-                if not is_tp_split and is_merged_gemm:
-                    # In case of merged matrices, we allow the last
-                    # matrix to not be a multiple of block size
-                    sizes_to_check = output_partition_sizes[:-1]
-                for output_partition_size in sizes_to_check:
-                    if output_partition_size % block_n != 0:
-                        raise ValueError(
-                            f"Weight output_partition_size = "
-                            f"{output_partition_size} is not divisible by "
-                            f"weight quantization block_n = {block_n}.")
+            assert self.weight_block_size is not None
+            layer.weight_block_size = self.weight_block_size
+            validate_fp8_block_shape(layer, input_size, output_size,
+                                     input_size_per_partition,
+                                     output_partition_sizes,
+                                     self.weight_block_size)
 
         # WEIGHT
-        weight_dtype = (torch.float8_e4m3fn
-                        if self.quant_config.is_checkpoint_fp8_serialized else
-                        params_dtype)
-
-        weight = ModelWeightParameter(data=torch.empty(
-            output_size_per_partition,
-            input_size_per_partition,
-            dtype=weight_dtype),
-                                      input_dim=1,
-                                      output_dim=0,
-                                      weight_loader=weight_loader)
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            weight = create_fp8_weight_parameter(output_size_per_partition,
+                                                 input_size_per_partition,
+                                                 weight_loader)
+        else:
+            # For non-serialized checkpoints, use original dtype
+            weight = ModelWeightParameter(data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=params_dtype),
+                                          input_dim=1,
+                                          output_dim=0,
+                                          weight_loader=weight_loader)
         layer.register_parameter("weight", weight)
 
         # If checkpoint is serialized fp8, load them.
@@ -325,154 +299,87 @@ class Fp8LinearMethod(LinearMethodBase):
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
             if not self.block_quant:
-                scale = PerTensorScaleParameter(
-                    data=torch.empty(len(output_partition_sizes),
-                                     dtype=torch.float32),
-                    weight_loader=weight_loader,
-                )
-                scale[:] = torch.finfo(torch.float32).min
+                scale = create_fp8_scale_parameter(PerTensorScaleParameter,
+                                                   output_partition_sizes,
+                                                   input_size_per_partition,
+                                                   None, weight_loader)
                 set_weight_attrs(scale, {"scale_type": "weight_scale"})
                 layer.register_parameter("weight_scale", scale)
             else:
-                assert self.quant_config.activation_scheme == "dynamic"
-                scale = BlockQuantScaleParameter(
-                    data=torch.empty(
-                        (output_size_per_partition + block_n - 1) // block_n,
-                        (input_size_per_partition + block_k - 1) // block_k,
-                        dtype=torch.float32,
-                    ),
-                    input_dim=1,
-                    output_dim=0,
-                    weight_loader=weight_loader,
-                )
-                scale[:] = torch.finfo(torch.float32).min
+                assert not self.act_q_static
+                assert self.weight_block_size is not None
+                scale = create_fp8_scale_parameter(BlockQuantScaleParameter,
+                                                   output_partition_sizes,
+                                                   input_size_per_partition,
+                                                   self.weight_block_size,
+                                                   weight_loader)
                 set_weight_attrs(scale, {"scale_type": "weight_scale"})
                 # The weight_scale_inv name is intentional for deepseekv3
                 layer.register_parameter("weight_scale_inv", scale)
 
             # INPUT ACTIVATION SCALE
-            if self.quant_config.activation_scheme == "static":
-                scale = PerTensorScaleParameter(data=torch.empty(
-                    len(output_partition_sizes), dtype=torch.float32),
-                                                weight_loader=weight_loader)
-
-                scale[:] = torch.finfo(torch.float32).min
+            if self.act_q_static:
+                scale = create_fp8_input_scale(output_partition_sizes,
+                                               weight_loader)
                 set_weight_attrs(scale, {"scale_type": "input_scale"})
                 layer.register_parameter("input_scale", scale)
             else:
                 layer.register_parameter("input_scale", None)
 
-    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
-        # Pad the weight tensor. This is an optimization on ROCm platform, which
-        # can benefit from tensors located far enough from one another in memory
-        if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm()
-                and weight.stride(-1) == 1
-                and (weight.stride(-2) * weight.element_size()) % 512 == 0):
-            num_pad = 256 // weight.element_size()
-            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
-        return weight
-
     def process_weights_after_loading(self, layer: Module) -> None:
         size_k_first = True
+        input_scale = None
         # TODO(rob): refactor block quant into separate class.
         if self.block_quant:
-            assert self.quant_config.activation_scheme == "dynamic"
+            assert not self.act_q_static
             size_k_first = False
-            if current_platform.is_fp8_fnuz():
-                weight, weight_scale_inv, _ = \
-                    normalize_e4m3fn_to_e4m3fnuz(
-                        weight=layer.weight,
-                        weight_scale=layer.weight_scale_inv)
-            else:
-                weight = layer.weight.data
-                weight_scale_inv = layer.weight_scale_inv.data
 
-            weight = self._maybe_pad_weight(weight)
-
-            # Torch.compile cannot use Parameter subclasses.
-            layer.weight = Parameter(weight, requires_grad=False)
-            layer.weight_scale_inv = Parameter(weight_scale_inv,
-                                               requires_grad=False)
+            weight, weight_scale = process_fp8_weight_block_strategy(
+                layer.weight, layer.weight_scale_inv)
+            # Delete the weight_scale_inv parameter to avoid confusion
+            # with the weight_scale parameter
+            del layer.weight_scale_inv
 
         # If checkpoint not serialized fp8, quantize the weights.
         elif not self.quant_config.is_checkpoint_fp8_serialized:
             qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
                                                          scale=None)
+            weight = qweight.t()
 
-            # Update the layer with the new values.
-            layer.weight = Parameter(qweight.t(), requires_grad=False)
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-            # layer.input_scale is None indicates dynamic quant and scale is
-            # computed from input.
-            layer.input_scale = None
-
-        # If checkpoint is fp8, handle that there are N scales for N
+        # If checkpoint is fp8 per-tensor, handle that there are N scales for N
         # shards in a fused module
         else:
-            layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
-                                                    requires_grad=False)
-            if self.quant_config.activation_scheme == "static":
-                layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
-                                                       requires_grad=False)
-
             weight = layer.weight
             weight_scale = layer.weight_scale
 
             # If using w8a8, torch._scaled_mm needs per tensor, so
             # requantize the logical shards as a single weight.
             if not self.use_marlin:
-                # Dequant -> Quant with max scale so we can run per tensor.
-                if current_platform.is_fp8_fnuz():
-                    weight, weight_scale, input_scale = \
-                        normalize_e4m3fn_to_e4m3fnuz(
-                            weight=weight,
-                            weight_scale=weight_scale,
-                            input_scale=layer.input_scale)
-                    if input_scale is not None:
-                        layer.input_scale = Parameter(input_scale,
-                                                      requires_grad=False)
+                weight, weight_scale, input_scale = (
+                    process_fp8_weight_tensor_strategy(
+                        weight, weight_scale, layer.logical_widths,
+                        getattr(layer, 'input_scale', None)))
+                if self.act_q_static:
+                    assert input_scale is not None
+                    input_scale = input_scale.max()
+            weight = weight.t()
 
-                weight_scale, weight = requantize_with_max_scale(
-                    weight=weight,
-                    weight_scale=weight_scale,
-                    logical_widths=layer.logical_widths,
-                )
-
-            weight = self._maybe_pad_weight(weight)
-            # Update layer with new values.
-            layer.weight = Parameter(weight.t(), requires_grad=False)
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-            if self.quant_config.activation_scheme == "static":
-                layer.input_scale = Parameter(layer.input_scale.max(),
-                                              requires_grad=False)
+        # Update layer with new values.
+        layer.weight = Parameter(weight.data, requires_grad=False)
+        layer.weight_scale = Parameter(weight_scale.data, requires_grad=False)
+        layer.input_scale = Parameter(
+            input_scale,
+            requires_grad=False) if input_scale is not None else None
 
         if self.use_marlin:
             prepare_fp8_layer_for_marlin(layer, size_k_first)
             # Activations not quantized for marlin.
             del layer.input_scale
+            return
 
-        # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
-        # requantize the weight and input to the specific scale
-        # at the same time.
-        if is_deep_gemm_e8m0_used() and self.block_quant:
-            assert layer.weight_block_size is not None
-            block_sz = tuple(layer.weight_block_size)
-            requant_weight_ue8m0_inplace(
-                layer.weight.data,
-                layer.weight_scale_inv.data if hasattr(
-                    layer, "weight_scale_inv") else layer.weight_scale.data,
-                block_sz,
-            )
-
-        # SM90 Block FP8 CUTLASS requires row-major weight scales
-        if (self.block_quant and current_platform.is_device_capability(90)
-                and self.cutlass_block_fp8_supported
-                and not should_use_deepgemm_for_fp8_linear(
-                    torch.bfloat16, layer.weight)):
-            layer.weight_scale_inv = Parameter(
-                layer.weight_scale_inv.data.T.contiguous(),
-                requires_grad=False)
+        if self.block_quant:
+            maybe_post_process_fp8_weight_block(
+                layer, self.cutlass_block_fp8_supported)
 
     def apply(self,
               layer: torch.nn.Module,
@@ -490,18 +397,12 @@ class Fp8LinearMethod(LinearMethodBase):
                 bias=bias)
 
         if self.block_quant:
-            assert self.quant_config.weight_block_size is not None
-
-            return torch.ops.vllm.apply_w8a8_block_fp8_linear(
+            return apply_fp8_block_linear(
+                layer,
                 input=x,
-                weight=layer.weight,
-                block_size=self.quant_config.weight_block_size,
-                weight_scale=layer.weight_scale_inv,
-                input_scale=layer.input_scale,
                 bias=bias,
                 cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
-                use_aiter_and_is_supported=self.use_aiter_and_is_supported,
-            )
+                use_aiter_and_is_supported=self.use_aiter_and_is_supported)
 
         return self.fp8_linear.apply(input=x,
                                      weight=layer.weight,
@@ -528,7 +429,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         super().__init__(layer.moe_config)
         self.layer = layer
         self.quant_config = quant_config
-        self.block_quant = self.quant_config.weight_block_size is not None
+        self.weight_block_size = self.quant_config.weight_block_size
+        self.block_quant = self.weight_block_size is not None
 
         self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None
         self.fused_experts: Optional[
@@ -590,12 +492,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
         if self.block_quant:
-            assert self.quant_config.weight_block_size is not None
-            layer.weight_block_size = self.quant_config.weight_block_size
+            assert self.weight_block_size is not None
+            layer.weight_block_size = self.weight_block_size
             tp_size = get_tensor_model_parallel_world_size()
             block_n, block_k = (
-                self.quant_config.weight_block_size[0],
-                self.quant_config.weight_block_size[1],
+                self.weight_block_size[0],
+                self.weight_block_size[1],
             )
             # NOTE: To ensure proper alignment of the block-wise quantization
             # scales, the output_size of the weights for both the gate and up
@@ -952,7 +854,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 "BatchedTritonOrDeepGemmExperts(%s): "
                 "max_tokens_per_rank=%s, block_size=%s, per_act_token=%s",
                 self.__class__.__name__, max_num_tokens_per_rank,
-                self.quant_config.weight_block_size, False)
+                self.weight_block_size, False)
             return BatchedTritonOrDeepGemmExperts(
                 max_num_tokens=max_num_tokens_per_rank,
                 num_dispatchers=prepare_finalize.num_dispatchers(),
@@ -969,8 +871,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         else:
             logger.debug(
                 "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s",
-                self.__class__.__name__, self.quant_config.weight_block_size,
-                False)
+                self.__class__.__name__, self.weight_block_size, False)
             return TritonOrDeepGemmExperts(
                 quant_config=self.moe_quant_config,
                 allow_deep_gemm=self.allow_deep_gemm,
@@ -988,7 +889,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                       if self.block_quant else layer.w2_weight_scale),
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
-            block_shape=self.quant_config.weight_block_size,
+            block_shape=self.weight_block_size,
         )
 
     def apply(
@@ -1046,7 +947,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     intermediate_size=layer.intermediate_size_per_partition,
                     expert_offset=layer.ep_rank * layer.local_num_experts,
                     local_num_experts=layer.local_num_experts,
-                    block_shape=self.quant_config.weight_block_size,
+                    block_shape=self.weight_block_size,
                     routed_scaling=routed_scaling_factor,
                 )
             else:
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index bbe0c6f6d38ec..fc12483de0c0e 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -17,6 +17,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     group_broadcast)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_BLOCK_FP8_SUPPORTED)
+from vllm.model_executor.parameter import (BlockQuantScaleParameter,
+                                           ChannelQuantScaleParameter,
+                                           PerTensorScaleParameter)
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils import cdiv, direct_register_custom_op
@@ -794,3 +797,220 @@ def requant_weight_ue8m0_inplace(
         # Write back the results in-place.
         w_q.copy_(w_requant)
         s_old.copy_(s_requant)
+
+
+def check_aiter_fp8_linear_support() -> bool:
+    """AITER is only supported on ROCm and only for FP8_FNUZ,
+    which at the moment means the MI300 series."""
+    return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER
+            and envs.VLLM_ROCM_USE_AITER_LINEAR
+            and current_platform.is_fp8_fnuz())
+
+
+def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor:
+    """Pad the weight tensor. This is an optimization on ROCm platform, which
+    can benefit from tensors located far enough from one another in memory"""
+    if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm()
+            and weight.stride(-1) == 1
+            and (weight.stride(-2) * weight.element_size()) % 512 == 0):
+        num_pad = 256 // weight.element_size()
+        import torch.nn.functional as F
+        weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
+        torch.cuda.empty_cache()
+    return weight
+
+
+def validate_fp8_block_shape(layer: torch.nn.Module, input_size: int,
+                             output_size: int, input_size_per_partition: int,
+                             output_partition_sizes: list[int],
+                             block_size: list[int]) -> None:
+    """Validate block quantization shapes for tensor parallelism."""
+    from vllm.distributed import get_tensor_model_parallel_world_size
+
+    tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size())
+    block_n, block_k = block_size[0], block_size[1]
+
+    # Required by row parallel
+    if (tp_size > 1 and input_size // input_size_per_partition == tp_size
+            and input_size_per_partition % block_k != 0):
+        raise ValueError(
+            f"Weight input_size_per_partition = {input_size_per_partition} "
+            f"is not divisible by weight quantization block_k = {block_k}.")
+
+    # Required by column parallel or enabling merged weights
+    is_tp_split = (tp_size > 1
+                   and output_size // sum(output_partition_sizes) == tp_size)
+    is_merged_gemm = len(output_partition_sizes) > 1
+    if is_tp_split or is_merged_gemm:
+        sizes_to_check = output_partition_sizes
+        if not is_tp_split and is_merged_gemm:
+            # In case of merged matrices, we allow the last
+            # matrix to not be a multiple of block size
+            sizes_to_check = output_partition_sizes[:-1]
+        for output_partition_size in sizes_to_check:
+            if output_partition_size % block_n != 0:
+                raise ValueError(
+                    f"Weight output_partition_size = "
+                    f"{output_partition_size} is not divisible by "
+                    f"weight quantization block_n = {block_n}.")
+
+
+def create_fp8_weight_parameter(
+        output_size_per_partition: int, input_size_per_partition: int,
+        weight_loader: Optional[Callable]) -> torch.nn.Parameter:
+    """Create FP8 weight parameter."""
+    from vllm.model_executor.parameter import ModelWeightParameter
+
+    return ModelWeightParameter(data=torch.empty(output_size_per_partition,
+                                                 input_size_per_partition,
+                                                 dtype=torch.float8_e4m3fn),
+                                input_dim=1,
+                                output_dim=0,
+                                weight_loader=weight_loader)
+
+
+def create_fp8_scale_parameter(
+        parameter_type: torch.nn.Parameter, output_partition_sizes: list[int],
+        input_size_per_partition: int, block_size: Optional[list[int]],
+        weight_loader: Optional[Callable]) -> torch.nn.Parameter:
+    """Create scale parameter based on quantization strategy."""
+    if parameter_type == ChannelQuantScaleParameter:
+        scale = parameter_type(data=torch.empty(
+            (sum(output_partition_sizes), 1), dtype=torch.float32),
+                               output_dim=0,
+                               weight_loader=weight_loader)
+    elif parameter_type == BlockQuantScaleParameter:
+        assert block_size is not None
+        block_n, block_k = block_size[0], block_size[1]
+        output_size_per_partition = sum(output_partition_sizes)
+        scale = parameter_type(
+            data=torch.empty(
+                (output_size_per_partition + block_n - 1) // block_n,
+                (input_size_per_partition + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+    elif parameter_type == PerTensorScaleParameter:
+        scale = parameter_type(data=torch.empty(len(output_partition_sizes),
+                                                dtype=torch.float32),
+                               weight_loader=weight_loader)
+    else:
+        raise ValueError(f"Unknown parameter type: {parameter_type}")
+
+    scale[:] = torch.finfo(torch.float32).min
+    return scale
+
+
+def create_fp8_input_scale(
+        output_partition_sizes: list[int],
+        weight_loader: Optional[Callable]) -> torch.nn.Parameter:
+    """Create input scale parameter for static activation quantization."""
+    from vllm.model_executor.parameter import PerTensorScaleParameter
+
+    scale = PerTensorScaleParameter(data=torch.empty(
+        len(output_partition_sizes), dtype=torch.float32),
+                                    weight_loader=weight_loader)
+    scale[:] = torch.finfo(torch.float32).min
+    return scale
+
+
+def process_fp8_weight_tensor_strategy(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    logical_widths: list[int],
+    input_scale: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    """Process weights for tensor-wise quantization strategy."""
+    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+        normalize_e4m3fn_to_e4m3fnuz, requantize_with_max_scale)
+
+    if current_platform.is_fp8_fnuz():
+        weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+            weight=weight, weight_scale=weight_scale, input_scale=input_scale)
+
+    # Requantize with max scale
+    weight_scale, weight = requantize_with_max_scale(
+        weight=weight,
+        weight_scale=weight_scale,
+        logical_widths=logical_widths,
+    )
+
+    weight = _maybe_pad_fp8_weight(weight)
+    return weight, weight_scale, input_scale
+
+
+def process_fp8_weight_channel_strategy(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None
+) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    """Process weights for channel-wise quantization strategy."""
+    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+        normalize_e4m3fn_to_e4m3fnuz)
+
+    if current_platform.is_fp8_fnuz():
+        weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+            weight=weight, weight_scale=weight_scale, input_scale=input_scale)
+
+    return weight, weight_scale, input_scale
+
+
+def process_fp8_weight_block_strategy(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Process weights for block-wise quantization strategy."""
+    from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+        normalize_e4m3fn_to_e4m3fnuz)
+
+    if current_platform.is_fp8_fnuz():
+        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+            weight=weight, weight_scale=weight_scale)
+
+    weight = _maybe_pad_fp8_weight(weight)
+    return weight, weight_scale
+
+
+def maybe_post_process_fp8_weight_block(layer: torch.nn.Module,
+                                        cutlass_block_fp8_supported: bool):
+    assert layer.weight_block_size is not None
+
+    from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used,
+                                      should_use_deepgemm_for_fp8_linear)
+
+    # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
+    # requantize the weight and input to the specific scale
+    # at the same time.
+    if is_deep_gemm_e8m0_used():
+        block_sz = tuple(layer.weight_block_size)
+        requant_weight_ue8m0_inplace(layer.weight.data,
+                                     layer.weight_scale.data, block_sz)
+    # SM90 Block FP8 CUTLASS requires row-major weight scales
+    elif (current_platform.is_device_capability(90)
+          and cutlass_block_fp8_supported
+          and not should_use_deepgemm_for_fp8_linear(torch.bfloat16,
+                                                     layer.weight)):
+        layer.weight_scale = torch.nn.Parameter(
+            layer.weight_scale.data.T.contiguous(), requires_grad=False)
+
+
+def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor,
+                           bias: Optional[torch.Tensor],
+                           cutlass_block_fp8_supported: bool,
+                           use_aiter_and_is_supported: bool) -> torch.Tensor:
+    """Apply block-wise FP8 linear operation."""
+    assert layer.weight_block_size is not None
+
+    return torch.ops.vllm.apply_w8a8_block_fp8_linear(
+        input=input,
+        weight=layer.weight,
+        block_size=layer.weight_block_size,
+        weight_scale=layer.weight_scale,
+        input_scale=layer.input_scale,
+        bias=bias,
+        cutlass_block_fp8_supported=cutlass_block_fp8_supported,
+        use_aiter_and_is_supported=use_aiter_and_is_supported,
+    )

From bc19d7598566ae81b3f69b43cbc2bd34aa5497c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Thu, 18 Sep 2025 15:56:07 +0200
Subject: [PATCH 35/58] [Misc] Add kv-connector label (#25156)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 .github/mergify.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 94198b1251e09..75ee3e3c55b46 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -302,3 +302,20 @@ pull_request_rules:
     label:
       remove:
         - needs-rebase
+
+- name: label-kv-connector
+  description: Automatically apply kv-connector label
+  conditions:
+    - or:
+      - files~=^examples/online_serving/disaggregated[^/]*/.*
+      - files~=^examples/offline_inference/disaggregated[^/]*/.*
+      - files~=^examples/others/lmcache/
+      - files~=^tests/v1/kv_connector/
+      - files~=^vllm/distributed/kv_transfer/
+      - title~=(?i)\bP/?D\b
+      - title~=(?i)NIXL
+      - title~=(?i)LMCache
+  actions:
+    label:
+      add:
+        - kv-connector
\ No newline at end of file

From 01a583fea40571986ffe277549e5bb441d409768 Mon Sep 17 00:00:00 2001
From: jvlunteren <161835099+jvlunteren@users.noreply.github.com>
Date: Thu, 18 Sep 2025 16:27:01 +0200
Subject: [PATCH 36/58] [Kernel] Decouple Tile Size from Block Size in Triton
 Unified Attention Kernel (#21197)

Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com>
---
 .../test_triton_unified_attention.py          |   3 -
 .../attention/ops/triton_unified_attention.py | 122 ++++++++++--------
 2 files changed, 70 insertions(+), 55 deletions(-)

diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index 4b97d51e6ed21..ab91560e995c8 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -102,9 +102,6 @@ def test_triton_unified_attn(
 ) -> None:
     torch.set_default_device("cuda")
 
-    if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32:
-        pytest.skip("block size must be at least 32 for fp8")
-
     current_platform.seed_everything(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index d2ad2f7e8d2aa..591b68bfa6468 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -73,6 +73,7 @@ def kernel_unified_attention_2d(
     output_stride_1: tl.int64,  # int, should be equal to head_size
     qq_bias_stride_0: tl.int64,  # int
     BLOCK_SIZE: tl.constexpr,  # int
+    TILE_SIZE: tl.constexpr,  # int, must be power of 2
     HEAD_SIZE: tl.constexpr,  # int
     HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
     USE_ALIBI_SLOPES: tl.constexpr,  # bool
@@ -118,6 +119,7 @@ def kernel_unified_attention_2d(
 
     offs_m = tl.arange(0, BLOCK_M)
     offs_d = tl.arange(0, HEAD_SIZE_PADDED)
+    offs_t = tl.arange(0, TILE_SIZE)
     query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv
 
     query_offset_0 = cur_batch_in_all_start_index + query_pos
@@ -177,31 +179,32 @@ def kernel_unified_attention_2d(
     # actual sequence length
     max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
 
-    # calculate the number of tiles (blocks) that need to be processed to
-    # cover the longest sequence prefix (due to causal masking, blocks beyond
+    # calculate the number of tiles that need to be processed to
+    # cover the longest sequence prefix (due to causal masking, tiles beyond
     # this prefix can be skipped)
-    num_blocks = cdiv_fn(max_seq_prefix_len, BLOCK_SIZE)
+    num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE)
 
     # iterate through tiles
-    for j in range(0, num_blocks):
+    for j in range(0, num_tiles):
+        seq_offset = j * TILE_SIZE + offs_t
+        tile_mask = seq_offset < max_seq_prefix_len
 
-        physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j)
+        physical_block_idx = tl.load(block_tables_ptr + block_table_offset +
+                                     seq_offset // BLOCK_SIZE).to(tl.int64)
 
-        offs_n = tl.arange(0, BLOCK_SIZE)
-
-        v_offset = (physical_block_idx * stride_v_cache_0 +
+        v_offset = (physical_block_idx[:, None] * stride_v_cache_0 +
                     kv_head_idx * stride_v_cache_2 +
                     offs_d[None, :] * stride_v_cache_3 +
-                    offs_n[:, None] * stride_v_cache_1)
+                    (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1)
 
-        k_offset = (physical_block_idx * stride_k_cache_0 +
+        k_offset = (physical_block_idx[None, :] * stride_k_cache_0 +
                     kv_head_idx * stride_k_cache_2 +
                     offs_d[:, None] * stride_k_cache_3 +
-                    offs_n[None, :] * stride_k_cache_1)
+                    (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1)
 
-        # K : (HEAD_SIZE, BLOCK_SIZE)
+        # K : (HEAD_SIZE, TILE_SIZE)
         K_load = tl.load(key_cache_ptr + k_offset,
-                         mask=dim_mask[:, None],
+                         mask=dim_mask[:, None] & tile_mask[None, :],
                          other=0.0)
 
         if K_load.dtype.is_fp8():
@@ -212,9 +215,9 @@ def kernel_unified_attention_2d(
         else:
             K = K_load
 
-        # V : (BLOCK_SIZE, HEAD_SIZE)
+        # V : (TILE_SIZE, HEAD_SIZE)
         V_load = tl.load(value_cache_ptr + v_offset,
-                         mask=dim_mask[None, :],
+                         mask=dim_mask[None, :] & tile_mask[:, None],
                          other=0.0)
 
         if V_load.dtype.is_fp8():
@@ -225,12 +228,10 @@ def kernel_unified_attention_2d(
         else:
             V = V_load
 
-        seq_offset = j * BLOCK_SIZE + offs_n
-
         seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1
 
-        # S : (BLOCK_M, BLOCK_SIZE)
-        S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32)
+        # S : (BLOCK_M, TILE_SIZE)
+        S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32)
 
         S += scale * tl.dot(Q, K)
 
@@ -262,11 +263,12 @@ def kernel_unified_attention_2d(
         # compute running maximum
         # m_j : (BLOCK_M,)
         m_j = tl.maximum(M, tl.max(S, axis=1))
+
         # For sliding window there's a chance the max is -inf due to masking of
         # the entire row. In this case we need to set m_j 0 to avoid NaN
         m_j = tl.where(m_j > float("-inf"), m_j, 0.0)
 
-        # P : (BLOCK_M, BLOCK_SIZE)
+        # P : (BLOCK_M, TILE_SIZE)
         P = tl.exp(S - m_j[:, None])
 
         # l_j : (BLOCK_M,)
@@ -327,6 +329,7 @@ def kernel_unified_attention_3d(
         query_stride_1: tl.int64,  # int, should be equal to head_size
         qq_bias_stride_0: tl.int64,  # int
         BLOCK_SIZE: tl.constexpr,  # int
+        TILE_SIZE: tl.constexpr,  # int, must be power of 2
         HEAD_SIZE: tl.constexpr,  # int
         HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
         USE_ALIBI_SLOPES: tl.constexpr,  # bool
@@ -374,20 +377,19 @@ def kernel_unified_attention_3d(
 
     # number of segments for this particular sequence
     num_segments = NUM_SEGMENTS_PER_SEQ
-    blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE)
+    tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE)
 
-    if segm_idx * blocks_per_segment * BLOCK_SIZE >= seq_len:
+    if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len:
         return
 
     offs_m = tl.arange(0, BLOCK_M)
     offs_d = tl.arange(0, HEAD_SIZE_PADDED)
-
+    offs_t = tl.arange(0, TILE_SIZE)
     query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv
 
     query_offset_0 = cur_batch_in_all_start_index + query_pos
     query_offset_1 = kv_head_idx * num_queries_per_kv + \
         offs_m % num_queries_per_kv
-
     query_offset = (query_offset_0[:, None] * query_stride_0 +
                     query_offset_1[:, None] * query_stride_1 + offs_d[None, :])
 
@@ -433,30 +435,44 @@ def kernel_unified_attention_3d(
         qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0
                             )  # shape: [BLOCK_M]
 
-    num_blocks = cdiv_fn(seq_len, BLOCK_SIZE)
+    # compute the length of the longest sequence prefix spanned by any
+    # query token in the current q_block (q_block_local_idx)
+    max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + (
+        BLOCK_M - 1) // num_queries_per_kv + 1
+
+    # adjust for potential padding in the last q_block by considering the
+    # actual sequence length
+    max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len)
+
+    # calculate the number of tiles that need to be processed to
+    # cover the longest sequence prefix (due to causal masking, tiles beyond
+    # this prefix can be skipped)
+    num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE)
 
     # iterate through tiles within current segment
     for j in range(
-            segm_idx * blocks_per_segment,
-            min((segm_idx + 1) * blocks_per_segment, num_blocks),
+            segm_idx * tiles_per_segment,
+            min((segm_idx + 1) * tiles_per_segment, num_tiles),
     ):
-        physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j)
+        seq_offset = j * TILE_SIZE + offs_t
+        tile_mask = seq_offset < max_seq_prefix_len
 
-        offs_n = tl.arange(0, BLOCK_SIZE)
+        physical_block_idx = tl.load(block_tables_ptr + block_table_offset +
+                                     seq_offset // BLOCK_SIZE).to(tl.int64)
 
-        v_offset = (physical_block_idx * stride_v_cache_0 +
+        v_offset = (physical_block_idx[:, None] * stride_v_cache_0 +
                     kv_head_idx * stride_v_cache_2 +
                     offs_d[None, :] * stride_v_cache_3 +
-                    offs_n[:, None] * stride_v_cache_1)
+                    (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1)
 
-        k_offset = (physical_block_idx * stride_k_cache_0 +
+        k_offset = (physical_block_idx[None, :] * stride_k_cache_0 +
                     kv_head_idx * stride_k_cache_2 +
                     offs_d[:, None] * stride_k_cache_3 +
-                    offs_n[None, :] * stride_k_cache_1)
+                    (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1)
 
-        # K : (HEAD_SIZE, BLOCK_SIZE)
+        # K : (HEAD_SIZE, TILE_SIZE)
         K_load = tl.load(key_cache_ptr + k_offset,
-                         mask=dim_mask[:, None],
+                         mask=dim_mask[:, None] & tile_mask[None, :],
                          other=0.0)
 
         if K_load.dtype.is_fp8():
@@ -467,9 +483,9 @@ def kernel_unified_attention_3d(
         else:
             K = K_load
 
-        # V : (BLOCK_SIZE, HEAD_SIZE)
+        # V : (TILE_SIZE, HEAD_SIZE)
         V_load = tl.load(value_cache_ptr + v_offset,
-                         mask=dim_mask[None, :],
+                         mask=dim_mask[None, :] & tile_mask[:, None],
                          other=0.0)
 
         if V_load.dtype.is_fp8():
@@ -480,13 +496,10 @@ def kernel_unified_attention_3d(
         else:
             V = V_load
 
-        seq_offset = j * BLOCK_SIZE + offs_n
-
         seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1
 
-        # S : (BLOCK_M, BLOCK_SIZE)
-        S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32)
-
+        # S : (BLOCK_M, TILE_SIZE)
+        S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32)
         S += scale * tl.dot(Q, K)
 
         if USE_SOFTCAP:
@@ -517,11 +530,12 @@ def kernel_unified_attention_3d(
         # compute running maximum
         # m_j : (BLOCK_M,)
         m_j = tl.maximum(M, tl.max(S, axis=1))
+
         # For sliding window there's a chance the max is -inf due to masking of
         # the entire row. In this case we need to set m_j 0 to avoid NaN
         m_j = tl.where(m_j > float("-inf"), m_j, 0.0)
 
-        # P : (BLOCK_M, BLOCK_SIZE,)
+        # P : (BLOCK_M, TILE_SIZE)
         P = tl.exp(S - m_j[:, None])
 
         # l_j : (BLOCK_M,)
@@ -573,7 +587,7 @@ def reduce_segments(
     output_stride_0: tl.int64,  # int
     output_stride_1: tl.int64,  # int, should be equal to head_size
     block_table_stride: tl.int64,  # int
-    BLOCK_SIZE: tl.constexpr,  # int
+    TILE_SIZE: tl.constexpr,  # int
     HEAD_SIZE: tl.constexpr,  # int, must be power of 2
     HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
     query_start_len_ptr,  # [num_seqs+1]
@@ -594,10 +608,10 @@ def reduce_segments(
 
     # number of segments for this particular sequence
     num_segments = NUM_SEGMENTS_PER_SEQ
-    blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE)
+    tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE)
 
     # create masks for subsequent loads
-    act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE)
+    act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE)
     segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full(
         [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32)
     dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1,
@@ -671,13 +685,10 @@ def unified_attention(
     # Optional tensor for sinks
     sinks=None,
 ):
+
     assert causal, "Only causal attention is supported"
     assert q_descale is None, "Q scales not supported"
 
-    block_size = v.shape[1]
-    assert q.element_size() >= 2 or block_size >= 32, \
-        "Block size must be at least 32 for fp8"
-
     if sinks is not None:
         assert sinks.shape[0] == q.shape[1], \
         "Sinks must be num_query_heads size"
@@ -707,6 +718,12 @@ def unified_attention(
     #    = floor(q.shape[0] / BLOCK_Q) + num_seqs
     total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs
 
+    # Assigning default tile sizes for prefill and decode.
+    # Note: each tile size must be at least 32 for "fp8" (q.element_size() == 1)
+    # and at least 16 for all other data types.
+    TILE_SIZE_PREFILL = 32
+    TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32
+
     # if batch contains a prefill
     if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128:
         kernel_unified_attention_2d[(
@@ -736,6 +753,7 @@ def unified_attention(
             output_stride_1=out.stride(1),
             qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0,
             BLOCK_SIZE=block_size,
+            TILE_SIZE=TILE_SIZE_PREFILL,
             HEAD_SIZE=head_size,
             HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
             USE_ALIBI_SLOPES=use_alibi_slopes,
@@ -809,6 +827,7 @@ def unified_attention(
                 query_stride_1=q.stride(1),
                 qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0,
                 BLOCK_SIZE=block_size,
+                TILE_SIZE=TILE_SIZE_DECODE,
                 HEAD_SIZE=head_size,
                 HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
                 USE_ALIBI_SLOPES=use_alibi_slopes,
@@ -830,7 +849,6 @@ def unified_attention(
                 BLOCK_M=BLOCK_M,
                 NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS,
             )
-
         reduce_segments[(q.shape[0], num_query_heads)](
             output_ptr=out,
             segm_output_ptr=segm_output,
@@ -844,7 +862,7 @@ def unified_attention(
             output_stride_0=out.stride(0),
             output_stride_1=out.stride(1),
             block_table_stride=block_table.stride(0),
-            BLOCK_SIZE=block_size,
+            TILE_SIZE=TILE_SIZE_DECODE,
             HEAD_SIZE=head_size,
             HEAD_SIZE_PADDED=triton.next_power_of_2(head_size),
             query_start_len_ptr=cu_seqlens_q,

From 072d7e53e534d337b41262dd44ded9b44aa699ef Mon Sep 17 00:00:00 2001
From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Date: Thu, 18 Sep 2025 18:27:49 +0400
Subject: [PATCH 37/58] [PERF] Add `conv1d` metadata to GDN attn (#25105)

Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
---
 vllm/model_executor/layers/mamba/mamba2_metadata.py |  8 +++++---
 vllm/model_executor/models/qwen3_next.py            | 10 +++++++++-
 vllm/v1/attention/backends/gdn_attn.py              |  6 ++++++
 vllm/v1/attention/backends/mamba2_attn.py           |  4 ++--
 vllm/v1/attention/backends/short_conv_attn.py       |  4 ++--
 5 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py
index 368bfe3af1d3f..c926e17a2c197 100644
--- a/vllm/model_executor/layers/mamba/mamba2_metadata.py
+++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py
@@ -11,6 +11,7 @@ from vllm.attention.backends.placeholder_attn import (
     PlaceholderAttentionMetadata)
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import (
     Mamba2AttentionMetadata, _query_start_loc_to_chunk_indices_offsets)
 
@@ -45,8 +46,8 @@ class Mamba2Metadata:
     """
     nums_dict: Optional[dict] = None
     cu_seqlen: Optional[int] = None
-    batch_ptr: Optional[torch.tensor] = None
-    token_chunk_offset_ptr: Optional[torch.tensor] = None
+    batch_ptr: Optional[torch.Tensor] = None
+    token_chunk_offset_ptr: Optional[torch.Tensor] = None
 
 
 def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]:
@@ -117,7 +118,8 @@ def prepare_mamba2_metadata(
 
 def update_metadata(x: torch.Tensor, query_start_loc: torch.Tensor,
                     mamba2_metadata: Union[Mamba2Metadata,
-                                           Mamba2AttentionMetadata]):
+                                           Mamba2AttentionMetadata,
+                                           GDNAttentionMetadata]):
     """
     this is triggered upon handling a new input at the first layer
     """
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index eb060cb90f44c..0c974ee44eee2 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -35,6 +35,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.mamba.abstract import MambaBase
+from vllm.model_executor.layers.mamba.mamba2_metadata import update_metadata
 from vllm.model_executor.layers.mamba.mamba_mixer2 import (
     mamba_v2_sharded_weight_loader)
 from vllm.model_executor.layers.mamba.mamba_utils import (
@@ -414,6 +415,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
 
         assert isinstance(attn_metadata, dict)
         attn_metadata = attn_metadata[self.prefix]
+        conv_metadata = attn_metadata
         assert isinstance(attn_metadata, GDNAttentionMetadata)
         has_initial_state = attn_metadata.has_initial_state
         spec_query_start_loc = attn_metadata.spec_query_start_loc
@@ -475,10 +477,15 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
 
         # 2.2: process the remaining part
         if attn_metadata.num_prefills > 0:
+            mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1)
+            if conv_metadata.cu_seqlen is None:
+                conv_metadata = update_metadata(mixed_qkv_non_spec_T,
+                                                non_spec_query_start_loc,
+                                                conv_metadata)
             # - "cache_indices" updates the conv_state cache in positions
             #   pointed to by "mamba_cache_params.state_indices_tensor"
             mixed_qkv_non_spec = causal_conv1d_fn(
-                mixed_qkv_non_spec.transpose(0, 1),
+                mixed_qkv_non_spec_T,
                 conv_weights,
                 self.conv1d.bias,
                 activation=self.activation,
@@ -486,6 +493,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
                 has_initial_state=has_initial_state,
                 cache_indices=non_spec_state_indices_tensor,
                 query_start_loc=non_spec_query_start_loc,
+                metadata=conv_metadata,
             ).transpose(0, 1)
         elif attn_metadata.num_decodes > 0:
             mixed_qkv_non_spec = causal_conv1d_update(
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index ba89f93e8b56d..5dadc52d0fb1c 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -50,6 +50,12 @@ class GDNAttentionMetadata:
         Tensor] = None  # shape: [num_prefill_tokens + num_decode_tokens,]
     num_accepted_tokens: Optional[torch.Tensor] = None  # shape: [batch,]
 
+    # The following attributes are for triton implementation of causal_conv1d
+    nums_dict: Optional[dict] = None
+    cu_seqlen: Optional[int] = None
+    batch_ptr: Optional[torch.Tensor] = None
+    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+
 
 class GDNAttentionMetadataBuilder(
         AttentionMetadataBuilder[GDNAttentionMetadata]):
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 359bad1ea9dee..2fe1f14ca1db0 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -132,8 +132,8 @@ class Mamba2AttentionMetadata:
     # The following attributes are for triton implementation of causal_conv1d
     nums_dict: Optional[dict] = None
     cu_seqlen: Optional[int] = None
-    batch_ptr: Optional[torch.tensor] = None
-    token_chunk_offset_ptr: Optional[torch.tensor] = None
+    batch_ptr: Optional[torch.Tensor] = None
+    token_chunk_offset_ptr: Optional[torch.Tensor] = None
 
 
 class Mamba2AttentionMetadataBuilder(
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index f5ad65b02b4d4..717c40b37ecfb 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -34,8 +34,8 @@ class ShortConvAttentionMetadata:
     # For causal_conv1d
     nums_dict: Optional[dict] = None
     cu_seqlen: Optional[int] = None
-    batch_ptr: Optional[torch.tensor] = None
-    token_chunk_offset_ptr: Optional[torch.tensor] = None
+    batch_ptr: Optional[torch.Tensor] = None
+    token_chunk_offset_ptr: Optional[torch.Tensor] = None
 
 
 class ShortConvAttentionMetadataBuilder(

From 67244c86f0f1ffc06fcab9cad5e78989695cc15f Mon Sep 17 00:00:00 2001
From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com>
Date: Thu, 18 Sep 2025 22:29:40 +0800
Subject: [PATCH 38/58] feat(api): Return 503 on /health when engine is dead
 (#24897)

Signed-off-by: dongbo910220 <1275604947@qq.com>
Co-authored-by: Claude <noreply@anthropic.com>
---
 vllm/entrypoints/openai/api_server.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 93ea846f26f6c..912e664120929 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -103,6 +103,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
                         is_valid_ipv6_address, set_ulimit)
+from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
 
@@ -351,8 +352,11 @@ def engine_client(request: Request) -> EngineClient:
 @router.get("/health", response_class=Response)
 async def health(raw_request: Request) -> Response:
     """Health check."""
-    await engine_client(raw_request).check_health()
-    return Response(status_code=200)
+    try:
+        await engine_client(raw_request).check_health()
+        return Response(status_code=200)
+    except EngineDeadError:
+        return Response(status_code=503)
 
 
 @router.get("/load")

From 5f696c33b1fbf33fe91ecdd958874b9dd52f79b4 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Thu, 18 Sep 2025 23:22:01 +0800
Subject: [PATCH 39/58] [New Model] Support BertForTokenClassification / Named
 Entity Recognition (NER) task (#24872)

Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 docs/models/supported_models.md               | 11 +++
 examples/offline_inference/pooling/README.md  |  8 ++-
 examples/offline_inference/pooling/ner.py     | 54 ++++++++++++++
 examples/online_serving/pooling/README.md     |  6 ++
 examples/online_serving/pooling/ner.py        | 71 +++++++++++++++++++
 .../pooling/test_token_classification.py      | 39 ++++++++++
 tests/models/registry.py                      |  1 +
 vllm/entrypoints/llm.py                       |  4 ++
 vllm/model_executor/models/bert.py            | 52 ++++++++++++++
 vllm/model_executor/models/registry.py        |  1 +
 vllm/v1/attention/backends/flex_attention.py  | 12 +++-
 11 files changed, 257 insertions(+), 2 deletions(-)
 create mode 100644 examples/offline_inference/pooling/ner.py
 create mode 100644 examples/online_serving/pooling/ner.py
 create mode 100644 tests/models/language/pooling/test_token_classification.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 7aeaeca97699c..b67ebcbe3c81a 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -554,6 +554,17 @@ If your model is not in the above list, we will try to automatically convert the
     For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
     e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
 
+#### Token Classification
+
+These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API.
+
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
+|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
+| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. |  |  | ✅︎ |
+
+!!! note
+    For Named Entity Recognition (NER) usage, please refer to <gh-file:examples/offline_inference/pooling/ner.py> and <gh-file:examples/online_serving/pooling/ner.py>.
+
 [](){ #supported-mm-models }
 
 ## List of Multimodal Language Models
diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md
index 8693f5e08e0ba..79afbd9cfac47 100644
--- a/examples/offline_inference/pooling/README.md
+++ b/examples/offline_inference/pooling/README.md
@@ -26,8 +26,14 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
 python examples/offline_inference/pooling/embed_matryoshka_fy.py
 ```
 
+## Named Entity Recognition (NER) usage
+
+```bash
+python examples/offline_inference/pooling/ner.py
+```
+
 ## Qwen3 reranker usage
 
 ```bash
-python qwen3_reranker.py
+python examples/offline_inference/pooling/qwen3_reranker.py
 ```
diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py
new file mode 100644
index 0000000000000..f18742fac0d54
--- /dev/null
+++ b/examples/offline_inference/pooling/ner.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(
+        model="boltuix/NeuroBERT-NER",
+        runner="pooling",
+        enforce_eager=True,
+        trust_remote_code=True,
+    )
+    return parser.parse_args()
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
+    ]
+
+    # Create an LLM.
+    llm = LLM(**vars(args))
+    tokenizer = llm.get_tokenizer()
+    label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label
+
+    # Run inference
+    outputs = llm.encode(prompts)
+
+    for prompt, output in zip(prompts, outputs):
+        logits = output.outputs.data
+        predictions = logits.argmax(dim=-1)
+
+        # Map predictions to labels
+        tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids)
+        labels = [label_map[p.item()] for p in predictions]
+
+        # Print results
+        for token, label in zip(tokens, labels):
+            if token not in tokenizer.all_special_tokens:
+                print(f"{token:15} → {label}")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md
index f7926542202d6..2c271b6a32bc2 100644
--- a/examples/online_serving/pooling/README.md
+++ b/examples/online_serving/pooling/README.md
@@ -12,6 +12,12 @@ python examples/online_serving/pooling/cohere_rerank_client.py
 python examples/online_serving/pooling/jinaai_rerank_client.py
 ```
 
+## Named Entity Recognition (NER) usage
+
+```bash
+python examples/online_serving/pooling/ner.py
+```
+
 ## Openai chat embedding for multimodal usage
 
 ```bash
diff --git a/examples/online_serving/pooling/ner.py b/examples/online_serving/pooling/ner.py
new file mode 100644
index 0000000000000..9ec2bd45a0fe5
--- /dev/null
+++ b/examples/online_serving/pooling/ner.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER
+
+"""
+Example online usage of Pooling API for Named Entity Recognition (NER).
+
+Run `vllm serve <model> --runner pooling`
+to start up the server in vLLM. e.g.
+
+vllm serve boltuix/NeuroBERT-NER --runner pooling
+"""
+
+import argparse
+
+import requests
+import torch
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER")
+
+    return parser.parse_args()
+
+
+def main(args):
+    from transformers import AutoConfig, AutoTokenizer
+
+    api_url = f"http://{args.host}:{args.port}/pooling"
+    model_name = args.model
+
+    # Load tokenizer and config
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    config = AutoConfig.from_pretrained(model_name)
+    label_map = config.id2label
+
+    # Input text
+    text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025."
+    prompt = {"model": model_name, "input": text}
+
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+
+    # Run inference
+    output = pooling_response.json()["data"][0]
+    logits = torch.tensor(output["data"])
+    predictions = logits.argmax(dim=-1)
+    inputs = tokenizer(text, return_tensors="pt")
+
+    # Map predictions to labels
+    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+    labels = [label_map[p.item()] for p in predictions]
+    assert len(tokens) == len(predictions)
+
+    # Print results
+    for token, label in zip(tokens, labels):
+        if token not in tokenizer.all_special_tokens:
+            print(f"{token:15} → {label}")
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py
new file mode 100644
index 0000000000000..fd5e48a8b1449
--- /dev/null
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModelForTokenClassification
+
+from tests.models.utils import softmax
+
+
+@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"])
+# The float32 is required for this tiny model to pass the test.
+@pytest.mark.parametrize("dtype", ["float"])
+@torch.inference_mode
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForTokenClassification) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            hf_outputs.append(softmax(output.logits[0]))
+
+    # check logits difference
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output).cpu().float()
+        vllm_output = torch.tensor(vllm_output).cpu().float()
+        assert torch.allclose(hf_output, vllm_output, 1e-2)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 93aa9d4025498..e9cc5170ade74 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -414,6 +414,7 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
 
     # [Cross-encoder]
     "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"),  # noqa: E501
+    "BertForTokenClassification": _HfExamplesInfo("boltuix/NeuroBERT-NER"),
     "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base",  # noqa: E501
                                                        trust_remote_code=True,
                                                        hf_overrides={
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 63e9478612bb1..df6b16c73d6e7 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -943,6 +943,10 @@ class LLM:
             considered legacy and may be deprecated in the future. You should
             instead pass them via the `inputs` parameter.
         """
+
+        if self.supported_tasks == ["encode"] and pooling_task is None:
+            pooling_task = "encode"
+
         if pooling_task is None:
             if "embed" in self.supported_tasks:
                 pooling_task = "embed"
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index c07e5364814ac..ee32587f6b1b4 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -611,3 +611,55 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
                          positions=positions,
                          inputs_embeds=inputs_embeds,
                          intermediate_tensors=intermediate_tensors)
+
+
+@default_pooling_type("ALL")
+class BertForTokenClassification(nn.Module):
+    is_pooling_model = True
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        self.head_dtype = vllm_config.model_config.head_dtype
+        self.num_labels = config.num_labels
+        self.bert = BertModel(vllm_config=vllm_config,
+                              prefix=maybe_prefix(prefix, "bert"),
+                              embedding_class=BertEmbedding)
+        self.classifier = nn.Linear(config.hidden_size,
+                                    config.num_labels,
+                                    dtype=self.head_dtype)
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        self.pooler = DispatchPooler({
+            "encode":
+            Pooler.for_encode(pooler_config),
+        })
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        loader = AutoWeightsLoader(self)
+        loaded_params = loader.load_weights(weights)
+        return loaded_params
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        if token_type_ids is not None:
+            assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
+            assert input_ids is not None
+            _encode_token_type_ids(input_ids, token_type_ids)
+
+        hidden_states = self.bert(input_ids=input_ids,
+                                  positions=positions,
+                                  inputs_embeds=inputs_embeds,
+                                  intermediate_tensors=intermediate_tensors)
+
+        hidden_states = hidden_states.to(self.head_dtype)
+        return self.classifier(hidden_states)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 707b57106e6d9..1382fd9e93ea3 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -193,6 +193,7 @@ _EMBEDDING_MODELS = {
 
 _CROSS_ENCODER_MODELS = {
     "BertForSequenceClassification": ("bert", "BertForSequenceClassification"),
+    "BertForTokenClassification": ("bert", "BertForTokenClassification"),
     "GteNewForSequenceClassification": ("bert_with_rope",
                                         "GteNewForSequenceClassification"),
     "ModernBertForSequenceClassification": ("modernbert",
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index cb983494216a7..662d3984554ad 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -720,6 +720,15 @@ class FlexAttentionImpl(AttentionImpl):
                 (query, key, value),
             )
 
+            query = query[:, :, :num_actual_tokens, :]
+            if ((key_tensor.size(-2) > num_actual_tokens)
+                    or (value_tensor.size(-2) > num_actual_tokens)):
+                # In the encoder-only model with torch.compile,
+                # qkv might be padded, which might cause an exception.
+                # see: https://github.com/vllm-project/vllm/pull/24872#discussion_r2353252290
+                key_tensor = key_tensor[:, :, :num_actual_tokens, :]
+                value_tensor = value_tensor[:, :, :num_actual_tokens, :]
+
         else:
             assert self.attn_type == AttentionType.DECODER
             key_cache, value_cache = kv_cache.unbind(0)
@@ -744,7 +753,8 @@ class FlexAttentionImpl(AttentionImpl):
                 (query, key_cache, value_cache),
             )
 
-        query = query[:, :, :num_actual_tokens, :]
+            query = query[:, :, :num_actual_tokens, :]
+
         # Doesn't work for now -> constraint violation
         # torch._dynamo.try_mark_dynamic(query, 2)
 

From b419937c78017dc4c5bfa19f11547f4832ea2290 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?=
 <ohg3417@gmail.com>
Date: Fri, 19 Sep 2025 00:23:26 +0900
Subject: [PATCH 40/58] [Docs] Fix warnings in mkdocs build (continued)
 (#25163)

Signed-off-by: Zerohertz <ohg3417@gmail.com>
---
 .../device_communicators/shm_object_storage.py            | 2 +-
 vllm/entrypoints/openai/serving_engine.py                 | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py
index 352e7525d4c84..0310fc14da256 100644
--- a/vllm/distributed/device_communicators/shm_object_storage.py
+++ b/vllm/distributed/device_communicators/shm_object_storage.py
@@ -253,7 +253,7 @@ class SingleWriterShmRingBuffer:
 
         Args:
             nbytes (int, optional): The size of the buffer to free. If None,
-            frees the maximum size of the ring buffer.
+                frees the maximum size of the ring buffer.
         '''
 
         assert self.is_writer, "Only the writer can free buffers."
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index d391cc50ad232..4eb1f8b89d64f 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -697,9 +697,7 @@ class OpenAIServing:
         add_special_tokens: bool = True,
     ) -> TextTokensPrompt:
         """
-        A simpler implementation of
-        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
-        that assumes single input.
+        A simpler implementation that tokenizes a single prompt input.
         """
         async for result in self._tokenize_prompt_inputs_async(
                 request,
@@ -718,9 +716,7 @@ class OpenAIServing:
         add_special_tokens: bool = True,
     ) -> AsyncGenerator[TextTokensPrompt, None]:
         """
-        A simpler implementation of
-        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
-        that assumes multiple inputs.
+        A simpler implementation that tokenizes multiple prompt inputs.
         """
         for prompt in prompt_inputs:
             if isinstance(prompt, str):

From 2ea50e977aac00c63e78990a7477bb91295df183 Mon Sep 17 00:00:00 2001
From: Shu Wang <shuw@nvidia.com>
Date: Thu, 18 Sep 2025 10:52:58 -0500
Subject: [PATCH 41/58] Enable Allgather/ReduceScatter backend for
 NaiveAllToAll (#23964)

Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
---
 .../device_communicators/all2all.py           | 39 +++++++++++++++++++
 .../device_communicators/cuda_communicator.py |  4 ++
 vllm/envs.py                                  | 17 +++++---
 3 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 427fd040fcb71..149df73d8667b 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -5,6 +5,7 @@ from typing import Any
 import torch
 import torch.distributed as dist
 
+from vllm.distributed import get_dp_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.utils import has_deep_ep, has_pplx
@@ -69,6 +70,44 @@ class NaiveAll2AllManager(All2AllManagerBase):
         pass
 
 
+class AgRsAll2AllManager(All2AllManagerBase):
+    """
+    An implementation of all2all communication based on
+    all-gather (dispatch) and reduce-scatter (combine).
+    """
+
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
+
+    def dispatch(self, hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor):
+        """
+        Gather hidden_states and router_logits from all dp ranks.
+        """
+        sizes = get_forward_context(
+        ).dp_metadata.get_chunk_sizes_across_dp_rank()
+        hidden_states, router_logits = get_dp_group().all_gatherv(
+            [hidden_states, router_logits],
+            dim=0,
+            sizes=sizes,
+        )
+        return hidden_states, router_logits
+
+    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Reduce-scatter hidden_states across all dp ranks.
+        """
+        sizes = get_forward_context(
+        ).dp_metadata.get_chunk_sizes_across_dp_rank()
+        hidden_states = get_dp_group().reduce_scatterv(hidden_states,
+                                                       dim=0,
+                                                       sizes=sizes)
+        return hidden_states
+
+    def destroy(self):
+        pass
+
+
 class PPLXAll2AllManager(All2AllManagerBase):
     """
     All2All communication based on PPLX kernels.
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 78c90b006ffc8..b2bf3bc3cc2ed 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -87,6 +87,10 @@ class CudaCommunicator(DeviceCommunicatorBase):
                 from .all2all import NaiveAll2AllManager
                 self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
                 logger.info("Using naive all2all manager.")
+            elif all2all_backend == "allgather_reducescatter":
+                from .all2all import AgRsAll2AllManager
+                self.all2all_manager = AgRsAll2AllManager(self.cpu_group)
+                logger.info("Using AllGather-ReduceScatter all2all manager.")
             elif all2all_backend == "pplx":
                 from .all2all import PPLXAll2AllManager
                 self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
diff --git a/vllm/envs.py b/vllm/envs.py
index 72e1d5b0ede81..19e2f8635275d 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -149,8 +149,11 @@ if TYPE_CHECKING:
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
-    VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput",
-                                  "deepep_low_latency"] = "naive"
+    VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx",
+                                  "deepep_high_throughput",
+                                  "deepep_low_latency",
+                                  "allgather_reducescatter"] = \
+                                  "allgather_reducescatter"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
@@ -1124,14 +1127,18 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # all2all backend for vllm's expert parallel communication
     # Available options:
-    # - "naive": naive all2all implementation using all-reduce
+    # - "naive": naive all2all implementation using broadcasts
+    # - "allgather_reducescatter": all2all implementation based on allgather and
+    #  reducescatter
     # - "pplx": use pplx kernels
     # - "deepep_high_throughput", use deepep high-throughput kernels
     # - "deepep_low_latency", use deepep low-latency kernels
     "VLLM_ALL2ALL_BACKEND":
-    env_with_choices("VLLM_ALL2ALL_BACKEND", "naive",
+    env_with_choices("VLLM_ALL2ALL_BACKEND", "allgather_reducescatter",
                      ["naive", "pplx",
-                     "deepep_high_throughput", "deepep_low_latency"]),
+                     "deepep_high_throughput",
+                     "deepep_low_latency",
+                     "allgather_reducescatter"]),
 
     # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
     # Both require compute capability 10.0 or above.

From 1c3b1634aa9d4be56fa6e931e96ec8145fedcc0a Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 18 Sep 2025 17:01:50 +0100
Subject: [PATCH 42/58] [Misc] Add codeowner for Transformers backend (#25180)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .github/CODEOWNERS | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index b8d6db06548d5..08717cdde643a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -61,6 +61,10 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/kv_connector @ApostaC
 /tests/v1/offloading @ApostaC
 
+# Transformers backend
+/vllm/model_executor/models/transformers.py @hmellor
+/tests/models/test_transformers.py @hmellor
+
 # Docs
 /docs @hmellor
 mkdocs.yaml @hmellor

From c4cb0af98a8e39950fa9b99acf7c241959a14ac8 Mon Sep 17 00:00:00 2001
From: qizixi <22851944+zixi-qi@users.noreply.github.com>
Date: Thu, 18 Sep 2025 09:12:19 -0700
Subject: [PATCH 43/58] [spec decode] Fix MTP inference path for MiMo-7B model
 (#25136)

Signed-off-by: zixi-qi <qizixi@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 examples/offline_inference/spec_decode.py |  6 +++++-
 vllm/config/speculative.py                |  2 +-
 vllm/model_executor/models/mimo_mtp.py    | 18 ++++++++++++++----
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 5af232cb6af6a..004e75b204642 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -53,7 +53,6 @@ def parse_args():
         "--method",
         type=str,
         default="eagle",
-        choices=["ngram", "eagle", "eagle3", "mtp"],
     )
     parser.add_argument("--num-spec-tokens", type=int, default=2)
     parser.add_argument("--prompt-lookup-max", type=int, default=5)
@@ -118,6 +117,11 @@ def main():
             "prompt_lookup_max": args.prompt_lookup_max,
             "prompt_lookup_min": args.prompt_lookup_min,
         }
+    elif args.method.endswith("mtp"):
+        speculative_config = {
+            "method": args.method,
+            "num_speculative_tokens": args.num_spec_tokens,
+        }
     else:
         raise ValueError(f"unknown method: {args.method}")
 
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index fca8c28e5c61e..2c861723c3966 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -31,7 +31,7 @@ logger = init_logger(__name__)
 
 SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
                             "mlp_speculator", "draft_model", "deepseek_mtp",
-                            "ernie_mtp", "qwen3_next_mtp"]
+                            "ernie_mtp", "qwen3_next_mtp", "mimo_mtp"]
 
 
 @config
diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py
index ac835edc001ea..09194e9f95d0e 100644
--- a/vllm/model_executor/models/mimo_mtp.py
+++ b/vllm/model_executor/models/mimo_mtp.py
@@ -241,6 +241,15 @@ class MiMoMTP(nn.Module):
 
     def map_model_name_to_mtp_param_name(self, name: str) -> str:
         import regex as re
+
+        # append mtp_start_layer_idx
+        pattern = r"(model\.mtp_layers\.)(\d+)(\.)"
+        match = re.match(pattern, name)
+        if match:
+            original_num = int(match.group(2))
+            new_num = original_num + self.config.num_hidden_layers
+            name = name.replace(match.group(), f"{match.group(1)}{new_num}.")
+        # check for early return
         name_without_prefix = [
             "token_layernorm", "hidden_layernorm", "input_proj",
             "final_layernorm"
@@ -248,10 +257,11 @@ class MiMoMTP(nn.Module):
         for sub_name in name_without_prefix:
             if sub_name in name:
                 return name
-        pattern = r"model.mtp_layers.(\d+)."
-        group = re.match(pattern, name)
-        if group is not None:
-            name = name.replace(group.group(), group.group() + "mtp_block.")
+        # add mtp_block
+        pattern = r"(model\.mtp_layers\.\d+\.)"
+        match = re.match(pattern, name)
+        if match:
+            name = name.replace(match.group(), match.group() + "mtp_block.")
         return name
 
     def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:

From dc3405936090f5c964a5b38c9de8c8400f01541c Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:36:55 -0400
Subject: [PATCH 44/58] [ROCm][CI/Build] Use ROCm7.0 as the base (#25178)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 docker/Dockerfile.rocm      |  5 ++-
 docker/Dockerfile.rocm_base | 61 ++++++++-----------------------------
 2 files changed, 16 insertions(+), 50 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 063fc49693288..c8900212e5a1b 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -29,7 +29,10 @@ ARG VLLM_BRANCH="main"
 ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
 	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
-	    && git checkout FETCH_HEAD
+	    && git checkout FETCH_HEAD \
+        && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
+               git remote add upstream "https://github.com/vllm-project/vllm.git" \
+               && git fetch upstream ; fi
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
 # -----------------------
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index 2ba5461dfe551..4973b57f76563 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -1,25 +1,23 @@
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete
-ARG HIPBLASLT_BRANCH="aa0bda7b"
-ARG HIPBLAS_COMMON_BRANCH="9b80ba8e"
-ARG LEGACY_HIPBLASLT_OPTION=
-ARG TRITON_BRANCH="e5be006"
-ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="f717b2af"
-ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
+ARG TRITON_BRANCH="f9e5bf54"
+ARG TRITON_REPO="https://github.com/ROCm/triton.git"
+ARG PYTORCH_BRANCH="b2fb6885"
+ARG PYTORCH_VISION_BRANCH="v0.23.0"
 ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="1a7f4dfa"
+ARG FA_BRANCH="0e60e394"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="4822e675"
+ARG AITER_BRANCH="2ab9f4cd"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base
 
-ENV PATH=/opt/rocm/llvm/bin:$PATH
+ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ENV AITER_ROCM_ARCH=gfx942;gfx950
 
 ARG PYTHON_VERSION=3.12
 
@@ -45,29 +43,6 @@ RUN apt-get update -y \
 
 RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
 
-FROM base AS build_hipblaslt
-ARG HIPBLASLT_BRANCH
-ARG HIPBLAS_COMMON_BRANCH
-# Set to "--legacy_hipblas_direct" for ROCm<=6.2
-ARG LEGACY_HIPBLASLT_OPTION
-RUN git clone https://github.com/ROCm/hipBLAS-common.git
-RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y
-RUN cd hipBLAS-common \
-    && git checkout ${HIPBLAS_COMMON_BRANCH} \
-    && mkdir build \
-    && cd build \
-    && cmake .. \
-    && make package \
-    && dpkg -i ./*.deb
-RUN git clone https://github.com/ROCm/hipBLASLt
-RUN cd hipBLASLt \
-    && git checkout ${HIPBLASLT_BRANCH} \
-    && apt-get install -y llvm-dev \
-    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
-    && cd build/release \
-    && make package
-RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
-
 FROM base AS build_triton
 ARG TRITON_BRANCH
 ARG TRITON_REPO
@@ -121,13 +96,11 @@ RUN cd aiter \
     && git checkout ${AITER_BRANCH} \
     && git submodule update --init --recursive \
     && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
 
 FROM base AS debs
 RUN mkdir /app/debs
-RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
-    cp /install/*.deb /app/debs
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
     cp /install/*.whl /app/debs
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@@ -138,11 +111,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
     cp /install/*.whl /app/debs
 
 FROM base AS final
-RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
-    dpkg -i /install/*deb \
-    && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
-    && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \
-    && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status
 RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
@@ -153,9 +121,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
     pip install /install/*.whl
 
 ARG BASE_IMAGE
-ARG HIPBLAS_COMMON_BRANCH
-ARG HIPBLASLT_BRANCH
-ARG LEGACY_HIPBLASLT_OPTION
 ARG TRITON_BRANCH
 ARG TRITON_REPO
 ARG PYTORCH_BRANCH
@@ -167,9 +132,6 @@ ARG FA_REPO
 ARG AITER_BRANCH
 ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
-    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
-    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
-    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
     && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
     && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
     && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
@@ -177,5 +139,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
     && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
     && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
     && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
     && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
     && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
\ No newline at end of file

From bbdc0f2366997536207abc212fcdae7a1b688159 Mon Sep 17 00:00:00 2001
From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:46:47 -0500
Subject: [PATCH 45/58] [ROCm][AITER][Bugfix] Switch AITER to use
 PIECEWISE_AND_FULL compilation (#25104)

Signed-off-by: Rohan138 <rohanpotdar138@gmail.com>
---
 vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 8eb3505cf274d..afb2283c44d37 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -232,7 +232,7 @@ class AiterFlashAttentionMetadata:
 
 class AiterFlashAttentionMetadataBuilder(
         AttentionMetadataBuilder[AiterFlashAttentionMetadata]):
-    cudagraph_support = AttentionCGSupport.ALWAYS
+    cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):

From 505805b645649be6a8e788a1f048b851fa123ef1 Mon Sep 17 00:00:00 2001
From: Or Ozeri <oro@il.ibm.com>
Date: Thu, 18 Sep 2025 20:57:07 +0300
Subject: [PATCH 46/58] [KV offload][1/N] Introduce an offloading component
 (#19848)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
---
 .buildkite/test-pipeline.yaml       |   1 +
 tests/v1/offloading/test_worker.py  | 152 +++++++++++++++++++++++++
 vllm/v1/offloading/abstract.py      | 165 ++++++++++++++++++++++++++++
 vllm/v1/offloading/mediums.py       |  39 +++++++
 vllm/v1/offloading/worker/worker.py | 142 ++++++++++++++++++++++++
 5 files changed, 499 insertions(+)
 create mode 100644 tests/v1/offloading/test_worker.py
 create mode 100644 vllm/v1/offloading/abstract.py
 create mode 100644 vllm/v1/offloading/mediums.py
 create mode 100644 vllm/v1/offloading/worker/worker.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 66dfc990805f2..5fd08296625ad 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -280,6 +280,7 @@ steps:
     # split the test to avoid interference
     - pytest -v -s v1/core
     - pytest -v -s v1/executor
+    - pytest -v -s v1/offloading
     - pytest -v -s v1/sample
     - pytest -v -s v1/logits_processors
     - pytest -v -s v1/worker
diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/offloading/test_worker.py
new file mode 100644
index 0000000000000..2391b565773aa
--- /dev/null
+++ b/tests/v1/offloading/test_worker.py
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.offloading.abstract import LoadStoreSpec
+from vllm.v1.offloading.worker.worker import (OffloadingHandler,
+                                              OffloadingWorker, TransferResult,
+                                              TransferSpec)
+
+
+class LoadStoreSpec1(LoadStoreSpec):
+
+    def __init__(self,
+                 submit_success: bool = True,
+                 async_success: bool = True,
+                 exception: bool = False):
+        self.finished = False
+        self.submit_success = submit_success
+        self.async_success = async_success
+        self.exception = exception
+
+    @staticmethod
+    def medium() -> str:
+        return "1"
+
+    def __repr__(self):
+        return f"{self.medium()}: {id(self)}"
+
+
+class LoadStoreSpec2(LoadStoreSpec):
+
+    @staticmethod
+    def medium() -> str:
+        return "2"
+
+    def __repr__(self):
+        return f"{self.medium()}: {id(self)}"
+
+
+class OffloadingHandler1To2(OffloadingHandler):
+
+    def __init__(self):
+        self.transfers: dict[int, LoadStoreSpec1] = {}
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        src, dst = spec
+        assert isinstance(src, LoadStoreSpec1)
+        assert isinstance(dst, LoadStoreSpec2)
+
+        if src.exception:
+            raise Exception("An expected exception. Don't worry!")
+        if not src.submit_success:
+            return False
+
+        self.transfers[job_id] = src
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        finished = []
+        for job_id, spec in list(self.transfers.items()):
+            if spec.finished:
+                finished.append((job_id, spec.async_success))
+                del self.transfers[job_id]
+        return finished
+
+
+class OffloadingHandler2To1(OffloadingHandler):
+
+    def __init__(self):
+        self.transfers: dict[int, LoadStoreSpec1] = {}
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        src, dst = spec
+        assert isinstance(src, LoadStoreSpec2)
+        assert isinstance(dst, LoadStoreSpec1)
+
+        self.transfers[job_id] = dst
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        finished = []
+        for job_id, spec in list(self.transfers.items()):
+            if spec.finished:
+                finished.append((job_id, spec.async_success))
+                del self.transfers[job_id]
+        return finished
+
+
+def test_offloading_worker():
+    """
+    Tests OffloadingWorker with 2 handlers.
+    One handler performs 1->2 transfers, and the other handles 2->1.
+    """
+    worker = OffloadingWorker()
+    handler1to2 = OffloadingHandler1To2()
+    handler2to1 = OffloadingHandler2To1()
+    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
+    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)
+
+    # 1st transfer 1->2 (exception)
+    src1 = LoadStoreSpec1(exception=True)
+    dst1 = LoadStoreSpec2()
+    assert not worker.transfer_async(1, (src1, dst1))
+
+    # 2nd transfer 1->2 (failure to submit)
+    src2 = LoadStoreSpec1(submit_success=False)
+    dst2 = LoadStoreSpec2()
+    assert not worker.transfer_async(2, (src2, dst2))
+
+    # 3rd transfer 1->2 (failure)
+    src3 = LoadStoreSpec1(async_success=False)
+    dst3 = LoadStoreSpec2()
+    assert worker.transfer_async(3, (src3, dst3))
+
+    # 4th transfer 1->2 (success)
+    src4 = LoadStoreSpec1()
+    dst4 = LoadStoreSpec2()
+    worker.transfer_async(4, (src4, dst4))
+    assert set(handler1to2.transfers.keys()) == {3, 4}
+
+    # 5th transfer 2->1
+    src5 = LoadStoreSpec2()
+    dst5 = LoadStoreSpec1()
+    worker.transfer_async(5, (src5, dst5))
+    assert set(handler2to1.transfers.keys()) == {5}
+
+    # no transfer completed yet
+    assert worker.get_finished() == []
+
+    # complete 3rd, 4th
+    src3.finished = True
+    src4.finished = True
+
+    # 6th transfer 1->2
+    src6 = LoadStoreSpec1()
+    dst6 = LoadStoreSpec2()
+    worker.transfer_async(6, (src6, dst6))
+
+    # 7th transfer 2->1
+    src7 = LoadStoreSpec2()
+    dst7 = LoadStoreSpec1()
+    worker.transfer_async(7, (src7, dst7))
+
+    # 6th and 7th transfers started
+    assert 6 in handler1to2.transfers
+    assert 7 in handler2to1.transfers
+
+    # verify result of 3rd and 4th transfers
+    assert (sorted(worker.get_finished()) == [(3, False), (4, True)])
+
+    # complete 6th and 7th transfers
+    src6.finished = True
+    dst7.finished = True
+    assert (sorted(worker.get_finished()) == [(6, True), (7, True)])
diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/offloading/abstract.py
new file mode 100644
index 0000000000000..9f9c044ea1c53
--- /dev/null
+++ b/vllm/v1/offloading/abstract.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+OffloadingManager class for managing KV data offloading in vLLM v1
+
+This class runs in the scheduler, tracks which blocks are offloaded
+and their address.
+
+The class provides the following primitives:
+    lookup() - find the length of the maximal series of blocks,
+        starting from the first one, that are all offloaded.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a LoadStoreSpec which encapsulates
+        information required for performing the load.
+    touch() - marks the given blocks as recently used. Can be used
+        to track block's LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a StoreSpec encapsulating offloading information,
+        as well as a list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, blocks of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    block_hashes_to_store: list[BlockHash]
+    store_spec: LoadStoreSpec
+    block_hashes_evicted: list[BlockHash]
+
+
+@dataclass
+class OffloadingEvent:
+    block_hashes: list[BlockHash]
+    block_size: int
+    medium: str
+    # True if blocks are removed, False if stored
+    removed: bool
+
+
+class OffloadingManager(ABC):
+
+    @abstractmethod
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        """
+        Finds the length of the maximal series of blocks, starting from the
+        first one, that are all offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to lookup.
+
+        Returns:
+            An integer representing the maximal number of blocks that
+            are currently offloaded.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker to locate and load
+            the actual offloaded KV data.
+        """
+        pass
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    @abstractmethod
+    def prepare_store(
+            self,
+            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If success is False, blocks that were not marked as stored will be
+        removed.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        return
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        """
+        Take the offloading events from the manager.
+
+        Yields:
+            New OffloadingEvents collected since the last call.
+        """
+        return ()
diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/offloading/mediums.py
new file mode 100644
index 0000000000000..5a1887848c9fc
--- /dev/null
+++ b/vllm/v1/offloading/mediums.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+
+import numpy as np
+
+from vllm.v1.offloading.abstract import LoadStoreSpec
+
+
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+
+
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/offloading/worker/worker.py
new file mode 100644
index 0000000000000..d2c2045d1f1f6
--- /dev/null
+++ b/vllm/v1/offloading/worker/worker.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+
+from vllm.logger import init_logger
+from vllm.v1.offloading.abstract import LoadStoreSpec
+
+# a single transfer spec (src_blocks_spec, dst_blocks_spec)
+TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
+# transfers are forwarded to handlers by (src_medium, dst_medium)
+TransferType = tuple[str, str]
+# transfer result (job_id, success)
+TransferResult = tuple[int, bool]
+
+logger = init_logger(__name__)
+
+
+class OffloadingHandler(ABC):
+    """
+    OffloadingHandler class for managing asynchronous KV data transfers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, and allows
+    collecting back completion statuses.
+
+    The class provides the following primitives:
+        transfer_async() - kicks off a new transfer job
+        get_finished() - returns (job_id, success) for newly finished jobs.
+    """
+
+    @abstractmethod
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+        """
+        pass
+
+    @abstractmethod
+    def get_finished(self) -> list[TransferResult]:
+        """
+        Get transfers finished since last call.
+
+        Returns:
+            A list of (job_id, success) of transfers.
+        """
+        pass
+
+
+class OffloadingWorker:
+    """
+    OffloadingWorker class for managing asynchronous KV data transfers
+    using multiple OffloadingHandlers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, by delegating
+    to one of its registered OffloadingHandlers, based on the transfer type.
+
+    The class provides the following primitives:
+        register_handler() - registers a new handler to handle
+            a specific transfer type
+        transfer_async() - kicks off a new transfer job
+            using one of the registered handlers.
+        get_finished() - returns (job_id, success) for newly finished jobs
+            from all handlers.
+    """
+
+    def __init__(self):
+        self.handlers: set[OffloadingHandler] = set()
+        self.transfer_type_to_handler: dict[TransferType,
+                                            OffloadingHandler] = {}
+
+    def register_handler(self, src_cls: type[LoadStoreSpec],
+                         dst_cls: type[LoadStoreSpec],
+                         handler: OffloadingHandler) -> None:
+        """
+        Registers a new handler.
+
+        Args:
+            src_cls: the source type of transfers handled by this handler.
+            dst_cls: the destination type of transfers handled by this handler.
+            handler: the handler that will handle transfers.
+        """
+        transfer_type = (src_cls.medium(), dst_cls.medium())
+        assert transfer_type not in self.transfer_type_to_handler
+        self.handlers.add(handler)
+        self.transfer_type_to_handler[transfer_type] = handler
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+        """
+        src, dst = spec
+        transfer_type = (src.medium(), dst.medium())
+        handler = self.transfer_type_to_handler.get(transfer_type)
+        assert handler is not None
+
+        try:
+            success = handler.transfer_async(job_id, spec)
+        except Exception as e:
+            logger.warning("Exception in %r transfer %d: %r",
+                           transfer_type,
+                           job_id,
+                           e,
+                           exc_info=True)
+            return False
+
+        if not success:
+            logger.warning("Failed to submit %r transfer %d", transfer_type,
+                           job_id)
+        else:
+            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id,
+                         spec)
+
+        return success
+
+    def get_finished(self) -> list[TransferResult]:
+        """
+        Get transfers finished since last call.
+
+        Returns:
+            A list of (job_id, success) of transfers.
+        """
+        finished = []
+        for handler in self.handlers:
+            finished.extend(handler.get_finished())
+        return finished

From e19bce40a1660cb7c03b790d0b000db155cf925d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 18 Sep 2025 11:07:42 -0700
Subject: [PATCH 47/58] [V0 Deprecation] Remove AsyncLLMEngine (#25025)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 tests/entrypoints/openai/test_chat.py         |   54 +-
 tests/entrypoints/openai/test_completion.py   |  830 -------------
 .../test_completion_with_prompt_embeds.py     |    3 +
 .../entrypoints/openai/test_lora_adapters.py  |    5 +-
 tests/entrypoints/openai/test_metrics.py      |    2 +-
 .../openai/test_return_tokens_as_ids.py       |   26 +-
 .../entrypoints/openai/test_skip_tokenizer.py |    8 -
 tests/v1/test_oracle.py                       |   18 -
 vllm/engine/async_llm_engine.py               | 1030 +----------------
 vllm/entrypoints/launcher.py                  |    2 -
 vllm/entrypoints/openai/api_server.py         |   65 +-
 11 files changed, 76 insertions(+), 1967 deletions(-)
 delete mode 100644 tests/entrypoints/openai/test_completion.py

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index a827f94cfbfe5..3bdfef7b4adbc 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -28,11 +28,9 @@ def monkeypatch_module():
     mpatch.undo()
 
 
-@pytest.fixture(scope="module", params=[False, True])
-def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
-
-    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+@pytest.fixture(scope="module")
+def server(monkeypatch_module, zephyr_lora_files):  #noqa: F811
+    monkeypatch_module.setenv('VLLM_USE_V1', '1')
 
     args = [
         # use half precision for speed and memory savings in CI environment
@@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
         yield remote_server
 
 
-@pytest.fixture
-def is_v1_server(server):
-    import os
-    assert os.environ['VLLM_USE_V1'] in ['0', '1']
-    return os.environ['VLLM_USE_V1'] == '1'
-
-
 @pytest_asyncio.fixture
 async def client(server):
     async with server.get_async_client() as async_client:
@@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
-        client: openai.AsyncOpenAI, sample_structured_outputs_choices,
-        is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
+    client: openai.AsyncOpenAI,
+    sample_structured_outputs_choices,
+):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat(
 
 
 @pytest.mark.asyncio
-async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
-                                            sample_json_schema,
-                                            is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
-
+async def test_structured_outputs_json_chat(
+    client: openai.AsyncOpenAI,
+    sample_json_schema,
+):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
-                                             sample_regex, is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
+async def test_structured_outputs_regex_chat(
+    client: openai.AsyncOpenAI,
+    sample_regex,
+):
 
     messages = [{
         "role": "system",
@@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs(
 
 
 @pytest.mark.asyncio
-async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
-                              is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Tool use is only supported in v1 engine")
+async def test_named_tool_use(
+    client: openai.AsyncOpenAI,
+    sample_json_schema,
+):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_response_format_json_schema(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip(
-            "JSON schema response format is only supported in v1 engine")
+async def test_response_format_json_schema(client: openai.AsyncOpenAI):
     prompt = 'what is 1+1? The format is "result": 2'
     # Check that this prompt cannot lead to a valid JSON without json_schema
     for _ in range(2):
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
deleted file mode 100644
index 0347513befe32..0000000000000
--- a/tests/entrypoints/openai/test_completion.py
+++ /dev/null
@@ -1,830 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# imports for structured outputs tests
-import json
-import os
-from typing import Optional
-
-import jsonschema
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-import regex as re
-import requests
-# downloading lora to test lora requests
-from openai import BadRequestError
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
-from ...utils import RemoteOpenAIServer
-
-# any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically these adapters use a different base model,
-# but we're not testing generation quality here
-
-
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files):
-    return [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--max-num-seqs",
-        "128",
-        "--enforce-eager",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-    ]
-
-
-@pytest.fixture(scope="module",
-                params=["", "--disable-frontend-multiprocessing"])
-def server(default_server_args, request):
-    if request.param:
-        default_server_args.append(request.param)
-
-    original_value = os.environ.get('VLLM_USE_V1')
-    os.environ['VLLM_USE_V1'] = '0'
-    try:
-        with RemoteOpenAIServer(MODEL_NAME,
-                                default_server_args) as remote_server:
-            yield remote_server
-    finally:
-        # Restore original env value
-        if original_value is None:
-            os.environ.pop('VLLM_USE_V1', None)
-        else:
-            os.environ['VLLM_USE_V1'] = original_value
-
-
-@pytest.fixture
-def is_v1_server(server):
-    import os
-
-    # For completion tests, we assume v0 since there's no explicit v1 setup
-    return os.environ.get('VLLM_USE_V1', '0') == '1'
-
-
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-
-    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-
-    choice = completion.choices[0]
-    assert len(choice.text) >= 5
-    assert choice.finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 1
-    assert completion.choices[0].prompt_logprobs is None
-
-
-@pytest.mark.asyncio
-async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
-    # test using token IDs
-    with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
-        # Added tokens should be rejected by the base model
-        await client.completions.create(
-            model=MODEL_NAME,
-            prompt=[0, 0, 32000, 32001, 32002],
-            echo=True,
-            max_tokens=5,
-            temperature=0.0,
-        )
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=None,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=0,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) == 1
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=5,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
-                                            model_name: str):
-
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        await client.completions.create(
-            model=model_name,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=21,
-        )
-        ...
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        stream = await client.completions.create(
-            model=model_name,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=30,
-            stream=True,
-        )
-        async for chunk in stream:
-            ...
-
-    # the server should still work afterwards
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
-                                                         (MODEL_NAME, 0),
-                                                         (MODEL_NAME, 1),
-                                                         (MODEL_NAME, None)])
-async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
-                                          model_name: str,
-                                          prompt_logprobs: Optional[int]):
-    params: dict = {
-        "prompt": ["A robot may not injure another robot", "My name is"],
-        "model": model_name,
-    }
-    if prompt_logprobs is not None:
-        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
-
-    if prompt_logprobs is not None and prompt_logprobs < 0:
-        with pytest.raises(BadRequestError):
-            await client.completions.create(**params)
-    else:
-        completion = await client.completions.create(**params)
-        if prompt_logprobs is not None:
-            assert completion.choices[0].prompt_logprobs is not None
-            assert len(completion.choices[0].prompt_logprobs) > 0
-
-            assert completion.choices[1].prompt_logprobs is not None
-            assert len(completion.choices[1].prompt_logprobs) > 0
-
-        else:
-            assert completion.choices[0].prompt_logprobs is None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_completion_streaming(client: openai.AsyncOpenAI,
-                                    model_name: str):
-    prompt = "What is an LLM?"
-
-    single_completion = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    single_output = single_completion.choices[0].text
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True)
-    chunks: list[str] = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    assert chunk.choices[0].finish_reason == "length"
-    assert chunk.choices[0].text
-    assert "".join(chunks) == single_output
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
-    """Streaming for parallel sampling.
-    The tokens from multiple samples, are flattened into a single stream,
-    with an index to indicate which sample the token belongs to.
-    """
-
-    prompt = "What is an LLM?"
-    n = 3
-    max_tokens = 5
-
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=max_tokens,
-                                             n=n,
-                                             stream=True)
-    chunks: list[list[str]] = [[] for i in range(n)]
-    finish_reason_count = 0
-    async for chunk in stream:
-        index = chunk.choices[0].index
-        text = chunk.choices[0].text
-        chunks[index].append(text)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    assert finish_reason_count == n
-    for chunk in chunks:
-        assert len(chunk) == max_tokens
-        print("".join(chunk))
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_completion_stream_options(client: openai.AsyncOpenAI,
-                                         model_name: str):
-    prompt = "What is the capital of France?"
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": False, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
-
-    async for chunk in stream:
-        assert chunk.usage is None
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": False, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is None
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": True, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-        else:
-            assert chunk.usage is None
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": True, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is not None
-        assert chunk.usage.prompt_tokens > 0
-        assert chunk.usage.completion_tokens > 0
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
-        if chunk.choices[0].finish_reason is not None:
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-
-    # Test stream=False, stream_options=
-    #     {"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": None})
-
-    # Test stream=False, stream_options=
-    #    {"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": True})
-
-    # Test stream=False, stream_options=
-    #     {"continuous_usage_stats": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"continuous_usage_stats": None})
-
-    # Test stream=False, stream_options=
-    #    {"continuous_usage_stats": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"continuous_usage_stats": True})
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
-    # test both text and token IDs
-    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
-        # test simple list
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-        )
-        assert len(batch.choices) == 2
-        assert batch.choices[0].text == batch.choices[1].text
-
-        # test n = 2
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            n=2,
-            max_tokens=5,
-            temperature=0.0,
-            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but
-                # not necessary for official client.
-                use_beam_search=True),
-        )
-        assert len(batch.choices) == 4
-        assert batch.choices[0].text != batch.choices[
-            1].text, "beam search should be different"
-        assert batch.choices[0].text == batch.choices[
-            2].text, "two copies of the same prompt should be the same"
-        assert batch.choices[1].text == batch.choices[
-            3].text, "two copies of the same prompt should be the same"
-
-        # test streaming
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-            stream=True,
-        )
-        texts = [""] * 2
-        async for chunk in batch:
-            assert len(chunk.choices) == 1
-            choice = chunk.choices[0]
-            texts[choice.index] += choice.text
-        assert texts[0] == texts[1]
-
-
-@pytest.mark.asyncio
-async def test_logits_bias(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 5
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-
-    # Test exclusive selection
-    token_id = 1000
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token_id): 100},
-        seed=42,
-    )
-    assert len(completion.choices[0].text) >= 5
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
-                                add_special_tokens=False)["input_ids"]
-    assert all([
-        response == expected
-        for response, expected in zip(response_tokens, expected_tokens)
-    ])
-
-    # Test ban
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-    )
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    first_response = completion.choices[0].text
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token): -100
-                    for token in response_tokens},
-    )
-    assert first_response != completion.choices[0].text
-
-
-@pytest.mark.asyncio
-async def test_allowed_token_ids(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 1
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-
-    # Test exclusive selection
-    allowed_ids = [21555, 21557, 21558]
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        seed=42,
-        extra_body=dict(allowed_token_ids=allowed_ids),
-        logprobs=1,
-    )
-    response_tokens = completion.choices[0].logprobs.tokens
-    assert len(response_tokens) == 1
-    assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_json_completion(
-    client: openai.AsyncOpenAI,
-    sample_json_schema,
-    is_v1_server: bool,
-):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example JSON for an employee profile "
-        f"that fits this schema: {sample_json_schema}",
-        n=3,
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(structured_outputs=dict(json=sample_json_schema)))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        output_json = json.loads(completion.choices[i].text)
-        jsonschema.validate(instance=output_json, schema=sample_json_schema)
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_regex_completion(
-    client: openai.AsyncOpenAI,
-    sample_regex,
-    is_v1_server: bool,
-):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
-        n=3,
-        temperature=1.0,
-        max_tokens=20,
-        extra_body=dict(structured_outputs=dict(regex=sample_regex)))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        assert re.fullmatch(sample_regex,
-                            completion.choices[i].text) is not None
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_choice_completion(
-    client: openai.AsyncOpenAI,
-    sample_structured_outputs_choices,
-    is_v1_server: bool,
-):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt="The best language for type-safe systems programming is ",
-        n=2,
-        temperature=1.0,
-        max_tokens=10,
-        extra_body=dict(structured_outputs=dict(
-            choice=sample_structured_outputs_choices)))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 2
-    for i in range(2):
-        assert completion.choices[i].text in sample_structured_outputs_choices
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_grammar(client: openai.AsyncOpenAI,
-                                          sample_sql_statements,
-                                          is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("grammar is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=("Generate a sql state that select col_1 from "
-                "table_1 where it is equals to 1"),
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(
-            structured_outputs=dict(grammar=sample_sql_statements), ))
-
-    content = completion.choices[0].text
-
-    # use Lark to parse the output, and make sure it's a valid parse tree
-    from lark import Lark
-    parser = Lark(sample_sql_statements)
-    parser.parse(content)
-
-    # remove spaces for comparison b/c we removed them in the grammar
-    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
-
-    assert content.strip() == ground_truth
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-@pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
-                                       model_name: str, logprobs_arg: int):
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # test using text and token IDs
-    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
-        completion = await client.completions.create(model=model_name,
-                                                     prompt=prompt,
-                                                     max_tokens=5,
-                                                     temperature=0.0,
-                                                     echo=True,
-                                                     logprobs=logprobs_arg)
-
-        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
-                                                             list) else prompt
-        assert re.search(r"^" + prompt_text, completion.choices[0].text)
-        logprobs = completion.choices[0].logprobs
-        assert logprobs is not None
-        assert len(logprobs.text_offset) > 5
-        assert (len(logprobs.token_logprobs) > 5
-                and logprobs.token_logprobs[0] is None)
-        assert (len(logprobs.top_logprobs) > 5
-                and logprobs.top_logprobs[0] is None)
-        for top_logprobs in logprobs.top_logprobs[1:]:
-            assert max(logprobs_arg,
-                       1) <= len(top_logprobs) <= logprobs_arg + 1
-        assert len(logprobs.tokens) > 5
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_type_error(client: openai.AsyncOpenAI,
-                                             sample_json_schema, sample_regex,
-                                             is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(structured_outputs=dict(json=42)))
-
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example string that fits this regex",
-            extra_body=dict(structured_outputs=dict(
-                regex=sample_regex,
-                json=sample_json_schema,
-            )))
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name,stream,echo",
-    [
-        (MODEL_NAME, False, False),
-        (MODEL_NAME, False, True),
-        (MODEL_NAME, True, False),
-        (MODEL_NAME, True, True)  # should not raise BadRequestError error
-    ],
-)
-async def test_echo_stream_completion(client: openai.AsyncOpenAI,
-                                      model_name: str, stream: bool,
-                                      echo: bool):
-    saying: str = "Hello, my name is"
-    result = await client.completions.create(model=model_name,
-                                             prompt=saying,
-                                             max_tokens=10,
-                                             temperature=0.0,
-                                             echo=echo,
-                                             stream=stream)
-
-    stop_reason = "length"
-
-    if not stream:
-        completion = result
-        assert completion.id is not None
-        assert completion.choices is not None and len(completion.choices) == 1
-
-        choice = completion.choices[0]
-        assert len(choice.text) >= 5
-        assert choice.finish_reason == stop_reason
-
-        if echo:
-            assert choice.text is not None and saying in choice.text
-        else:
-            assert choice.text is not None and saying not in choice.text
-
-    else:
-        chunks: list[str] = []
-        final_finish_reason = None
-        async for chunk in result:
-            if chunk.choices and chunk.choices[0].text:
-                chunks.append(chunk.choices[0].text)
-            if chunk.choices and chunk.choices[0].finish_reason:
-                final_finish_reason = chunk.choices[0].finish_reason
-
-        assert final_finish_reason == stop_reason
-        content = "".join(chunks)
-        if echo:
-            assert content is not None and saying in content
-        else:
-            assert content is not None and saying not in content
-
-
-@pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer,
-                           client: openai.AsyncOpenAI):
-    request_args = {
-        "model": MODEL_NAME,
-        "prompt": "Hello, my name is",
-        "max_tokens": 5,
-        "temperature": 0.0,
-        "logprobs": None,
-    }
-
-    completion = await client.completions.create(**request_args)
-
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
-    invocation_response.raise_for_status()
-
-    completion_output = completion.model_dump()
-    invocation_output = invocation_response.json()
-
-    assert completion_output.keys() == invocation_output.keys()
-    assert completion_output["choices"] == invocation_output["choices"]
diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index 7b58f851a4d21..3d56291bc793c 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -14,6 +14,9 @@ from transformers import AutoConfig
 
 from ...utils import RemoteOpenAIServer
 
+pytest.skip("Skipping prompt_embeds test until V1 supports it.",
+            allow_module_level=True)
+
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 10c0cb5f4d151..6f2addd3649da 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -53,12 +53,13 @@ def monkeypatch_module():
     mpatch.undo()
 
 
-@pytest.fixture(scope="module", params=[False, True])
+@pytest.fixture(scope="module", params=[True])
 def server_with_lora_modules_json(request, monkeypatch_module,
                                   zephyr_lora_files):
 
     use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+    assert use_v1
+    monkeypatch_module.setenv('VLLM_USE_V1', '1')
 
     # Define the json format LoRA module configurations
     lora_module_1 = {
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 0c9e0f3a51429..8917aa5a5efb9 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
-@pytest.fixture(scope="module", params=[True, False])
+@pytest.fixture(scope="module", params=[True])
 def use_v1(request):
     # Module-scoped variant of run_with_both_engines
     #
diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py
index 5f43fdc9588f3..ef9d5234f2317 100644
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@@ -10,8 +10,30 @@ import pytest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from ...utils import RemoteOpenAIServer
-from .test_completion import default_server_args  # noqa: F401
-from .test_completion import MODEL_NAME
+
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+
+@pytest.fixture(scope="module")
+def default_server_args(zephyr_lora_files):
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+    ]
 
 
 @pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py
index 840e0dac81c97..b469fc76fc7a2 100644
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"
 
 
-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.fixture(scope="module")
 def server():
     args = [
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index 794c1f68f1471..28c24f62895ab 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -7,7 +7,6 @@ import pytest
 import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine
 
 MODEL = "meta-llama/Llama-3.2-1B-Instruct"
 
@@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch):
         _ = AsyncEngineArgs(model=MODEL).create_engine_config()
         assert envs.VLLM_USE_V1
         m.delenv("VLLM_USE_V1")
-
-
-def test_reject_using_constructor_directly(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Sets VLLM_USE_V1=1.
-        vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
-
-        # This uses the V0 constructor directly.
-        with pytest.raises(ValueError):
-            AsyncLLMEngine(vllm_config,
-                           AsyncLLMEngine._get_executor_cls(vllm_config),
-                           log_stats=True)
-
-        m.delenv("VLLM_USE_V1")
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 6793041abc502..ede027759a8b2 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1,1032 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import asyncio
-import time
-import weakref
-from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
-                    Mapping, Optional, Set, Tuple, Type, Union)
-from weakref import ReferenceType
+from vllm.v1.engine.async_llm import AsyncLLM
 
-import vllm.envs as envs
-from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, VllmConfig)
-from vllm.core.scheduler import SchedulerOutputs
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_timeout import asyncio_timeout
-from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.metrics_types import StatLoggerBase
-from vllm.engine.protocol import EngineClient
-from vllm.executor.executor_base import ExecutorBase
-from vllm.inputs import PromptType
-from vllm.inputs.preprocess import InputPreprocessor
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.outputs import PoolingRequestOutput, RequestOutput
-from vllm.pooling_params import PoolingParams
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import ExecuteModelRequest
-from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
-
-logger = init_logger(__name__)
-ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
-
-
-class AsyncEngineDeadError(RuntimeError):
-    pass
-
-
-def _log_task_completion(task: asyncio.Task,
-                         error_callback: Callable[[Exception], None]) -> None:
-    """This function is only intended for the `engine.run_engine_loop()` task.
-
-    In particular, that task runs a `while True` loop that can only exit if
-    there is an exception.
-    """
-
-    exception = None
-    try:
-        return_value = task.result()
-        raise AssertionError(
-            f"The engine background task should never finish without an "
-            f"exception. {return_value}")
-    except asyncio.exceptions.CancelledError:
-        # We assume that if the task is cancelled, we are gracefully shutting
-        # down. This should only happen on program exit.
-        logger.info("Engine is gracefully shutting down.")
-    except Exception as e:
-        exception = e
-        logger.error("Engine background task failed", exc_info=e)
-        error_callback(exception)
-        raise AsyncEngineDeadError(
-            "Task finished unexpectedly. This should never happen! "
-            "Please open an issue on GitHub. See stack trace above for the "
-            "actual cause.") from e
-
-
-STOP_ITERATION = Exception()  # Sentinel
-
-
-class AsyncStream:
-    """A stream of RequestOutputs for a request that can be iterated over
-    asynchronously via an async generator."""
-
-    def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:
-        self.request_id = request_id
-        self._cancel = cancel
-        self._queue: asyncio.Queue = asyncio.Queue()
-        self._finished = False
-
-    def put(self, item: Union[RequestOutput, Exception]) -> None:
-        if not self._finished:
-            self._queue.put_nowait(item)
-
-    def finish(
-        self,
-        exception: Optional[Union[BaseException, Type[BaseException]]] = None,
-    ) -> None:
-        if not self._finished:
-            self._finished = True
-            self._queue.put_nowait(
-                exception if self._is_raisable(exception) else STOP_ITERATION)
-
-    @property
-    def finished(self) -> bool:
-        return self._finished
-
-    async def generator(self) -> AsyncGenerator[RequestOutput, None]:
-        try:
-            while True:
-                result = await self._queue.get()
-                if self._is_raisable(result):
-                    if result == STOP_ITERATION:
-                        return
-                    raise result
-                yield result
-        except GeneratorExit:
-            self._cancel(self.request_id)
-            raise asyncio.CancelledError from None
-
-    @staticmethod
-    def _is_raisable(value: Any):
-        return isinstance(value, BaseException) or \
-                (isinstance(value, type) and \
-                 issubclass(value, BaseException))
-
-
-class RequestTracker:
-    """Synchronous abstraction for tracking requests."""
-
-    def __init__(self) -> None:
-        self._request_streams: Dict[str, AsyncStream] = {}
-        self._aborted_requests: asyncio.Queue[str] = asyncio.Queue()
-        self._new_requests: asyncio.Queue[Tuple[AsyncStream,
-                                                dict]] = asyncio.Queue()
-        self.new_requests_event = asyncio.Event()
-
-    def __contains__(self, item):
-        return item in self._request_streams
-
-    def __len__(self) -> int:
-        return len(self._request_streams)
-
-    def propagate_exception(self,
-                            exc: Exception,
-                            request_id: Optional[str] = None) -> None:
-        """Propagate an exception to request streams
-        (all if request_id is None)."""
-        if request_id is not None:
-            self.abort_request(request_id, exception=exc)
-        else:
-            # NB: tuple() used here because self.abort_request pops the stream
-            # out of self._request_streams, so we can't iterate on it directly
-            for rid in tuple(self._request_streams.keys()):
-                self.abort_request(rid, exception=exc)
-
-    def process_request_output(self,
-                               request_output: RequestOutput,
-                               *,
-                               verbose: bool = False) -> None:
-        """Process a request output from the engine."""
-        request_id = request_output.request_id
-        finished = request_output.finished
-
-        if finished:
-            stream = self._request_streams.pop(request_id, None)
-        else:
-            stream = self._request_streams.get(request_id)
-        # Guard against a KeyError which can occur if the request was aborted
-        # while the output was generated
-        if stream is not None:
-            stream.put(request_output)
-            if finished:
-                stream.finish()
-
-        if verbose and finished:
-            logger.info("Finished request %s.", request_id)
-
-    def process_exception(self,
-                          request_id: str,
-                          exception: BaseException,
-                          *,
-                          verbose: bool = False) -> None:
-        """Propagate an exception from the engine."""
-        if verbose:
-            logger.info("Finished request %s.", request_id)
-        self.abort_request(request_id, exception=exception)
-
-    def add_request(self,
-                    request_id: str,
-                    *,
-                    verbose: bool = False,
-                    **engine_add_request_kwargs) -> AsyncStream:
-        """Add a request to be sent to the engine on the next background
-        loop iteration."""
-        if request_id in self._request_streams:
-            raise KeyError(f"Request {request_id} already exists.")
-
-        abort_request = partial(self.abort_request, verbose=verbose)
-        stream = AsyncStream(request_id, abort_request)
-        self._new_requests.put_nowait((stream, {
-            "request_id": request_id,
-            **engine_add_request_kwargs
-        }))
-
-        self.new_requests_event.set()
-
-        if verbose:
-            logger.info("Added request %s.", request_id)
-
-        return stream
-
-    def abort_request(self,
-                      request_id: str,
-                      *,
-                      exception: Optional[Union[BaseException,
-                                                Type[BaseException]]] = None,
-                      verbose: bool = False) -> None:
-        """Abort a request during next background loop iteration."""
-        if verbose:
-            logger.info("Aborted request %s.", request_id)
-
-        self._aborted_requests.put_nowait(request_id)
-
-        stream = self._request_streams.pop(request_id, None)
-        if stream is not None:
-            stream.finish(exception=exception)
-
-    def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]:
-        """Get the new requests and finished requests to be
-        sent to the engine."""
-        new_requests: List[Dict] = []
-        finished_requests: Set[str] = set()
-
-        while not self._aborted_requests.empty():
-            request_id = self._aborted_requests.get_nowait()
-            finished_requests.add(request_id)
-
-        while not self._new_requests.empty():
-            stream, new_request = self._new_requests.get_nowait()
-            request_id = stream.request_id
-            if request_id in finished_requests:
-                # The request has already been aborted.
-                stream.finish(asyncio.CancelledError)
-                finished_requests.discard(request_id)
-            else:
-                self._request_streams[request_id] = stream
-                new_requests.append(new_request)
-
-        return new_requests, finished_requests
-
-    async def wait_for_new_requests(self):
-        if not self.has_new_requests():
-            await self.new_requests_event.wait()
-        self.new_requests_event.clear()
-
-    def has_new_requests(self):
-        return not self._new_requests.empty()
-
-
-class _AsyncLLMEngine(LLMEngine):
-    """Extension of LLMEngine to add async methods."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    async def step_async(self, virtual_engine: int) -> List[RequestOutput]:
-        """Performs one decoding iteration and returns newly generated results.
-        The workers are ran asynchronously if possible.
-
-        This function performs one decoding iteration of the engine. It first
-        schedules the sequences to be executed in the next iteration and the
-        token blocks to be swapped in/out/copy. Then, it executes the model
-        and updates the scheduler with the model outputs. Finally, it decodes
-        the sequences and returns the newly generated results.
-        """
-        # these are cached outputs from previous iterations. None if on first
-        # iteration
-        cached_outputs = self.cached_scheduler_outputs[virtual_engine]
-        seq_group_metadata_list = cached_outputs.seq_group_metadata_list
-        scheduler_outputs = cached_outputs.scheduler_outputs
-        allow_async_output_proc = cached_outputs.allow_async_output_proc
-
-        ctx = self.scheduler_contexts[virtual_engine]
-
-        # Clear outputs for each new scheduler iteration
-        ctx.request_outputs.clear()
-
-        # skip the scheduler if there are any remaining steps in the seq groups.
-        # This ensures that the scheduler is only called again when the current
-        # batch has completed.
-        if not self._has_remaining_steps(seq_group_metadata_list):
-
-            # Schedule iteration
-            (seq_group_metadata_list, scheduler_outputs,
-             allow_async_output_proc
-             ) = self.scheduler[virtual_engine].schedule()
-
-            ctx.seq_group_metadata_list = seq_group_metadata_list
-            ctx.scheduler_outputs = scheduler_outputs
-
-            if not scheduler_outputs.is_empty():
-                # this will cause mamba_cache/minimax_cache failed
-                # to release finished_requests_ids of the last steps
-                finished_requests_ids = self.scheduler[
-                    virtual_engine].get_and_reset_finished_requests_ids()
-
-            # Maybe switch from async mode to sync mode
-            if not allow_async_output_proc and len(ctx.output_queue) > 0:
-                self._process_model_outputs(ctx=ctx)
-
-        else:
-            finished_requests_ids = list()
-
-        assert seq_group_metadata_list is not None
-        assert scheduler_outputs is not None
-
-        if not scheduler_outputs.is_empty():
-
-            # Check if we have a cached last_output from the previous iteration.
-            # For supporting PP this is probably the best way to pass the
-            # sampled_token_ids, as a separate broadcast over all the PP stages
-            # will cause one virtual engine's microbatch to block the pipeline.
-            last_sampled_token_ids = \
-                self._get_last_sampled_token_ids(virtual_engine)
-
-            execute_model_req = ExecuteModelRequest(
-                seq_group_metadata_list=seq_group_metadata_list,
-                blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
-                blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
-                blocks_to_copy=scheduler_outputs.blocks_to_copy,
-                virtual_engine=virtual_engine,
-                num_lookahead_slots=scheduler_outputs.num_lookahead_slots,
-                running_queue_size=scheduler_outputs.running_queue_size,
-                finished_requests_ids=finished_requests_ids,
-                # We use ExecuteModelRequest to pass the last sampled_token_ids
-                # to each of the non-last PP stages for in-place prepare_input.
-                last_sampled_token_ids=last_sampled_token_ids)
-
-            if allow_async_output_proc:
-                execute_model_req.async_callback = self.async_callbacks[
-                    virtual_engine]
-
-            # Execute the model.
-            outputs = await self.model_executor.execute_model_async(
-                execute_model_req)
-
-        else:
-            if len(ctx.output_queue) > 0:
-                self._process_model_outputs(ctx=ctx)
-            outputs = []
-
-        if not self._has_remaining_steps(seq_group_metadata_list):
-            # is_first_step_output is True only when the num_steps of all
-            # the sequences are 1.
-            is_first_step_output: bool = False if not seq_group_metadata_list \
-                else seq_group_metadata_list[0].state.num_steps == 1
-
-            ctx.append_output(outputs=outputs,
-                              seq_group_metadata_list=seq_group_metadata_list,
-                              scheduler_outputs=scheduler_outputs,
-                              is_async=allow_async_output_proc,
-                              is_last_step=True,
-                              is_first_step_output=is_first_step_output)
-
-            if outputs and allow_async_output_proc:
-                assert len(
-                    outputs
-                ) == 1, "Async postprocessor expects only a single output set"
-                self._advance_to_next_step(
-                    outputs[0], seq_group_metadata_list,
-                    scheduler_outputs.scheduled_seq_groups)
-
-            if not allow_async_output_proc:
-                self._process_model_outputs(ctx=ctx)
-
-                # Log stats.
-                self.do_log_stats(scheduler_outputs, outputs)
-
-                # Tracing
-                self.do_tracing(scheduler_outputs)
-
-        else:
-            # Multi-step case
-            return ctx.request_outputs
-
-        if not self.has_unfinished_requests():
-            # Drain async postprocessor (if exists)
-            if len(ctx.output_queue) > 0:
-                self._process_model_outputs(ctx=ctx)
-            assert len(ctx.output_queue) == 0
-
-        return ctx.request_outputs
-
-    async def stop_remote_worker_execution_loop_async(self) -> None:
-        """Stop the remote worker execution loop."""
-        await self.model_executor.stop_remote_worker_execution_loop_async()
-
-    async def get_tokenizer_async(self) -> AnyTokenizer:
-        return self.get_tokenizer()
-
-    async def add_request_async(
-        self,
-        request_id: str,
-        prompt: PromptType,
-        params: SamplingParams,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-    ) -> None:
-        """
-        Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
-        """
-        if lora_request is not None and not self.lora_config:
-            raise ValueError(f"Got lora_request {lora_request} but LoRA is "
-                             "not enabled!")
-        if priority != 0 and not self.scheduler_config.policy == "priority":
-            raise ValueError(f"Got priority {priority} but "
-                             "Priority scheduling is not enabled.")
-        if arrival_time is None:
-            arrival_time = time.time()
-
-        if data_parallel_rank is not None:
-            raise ValueError("Targeting data_parallel_rank only supported "
-                             "in v1 client.")
-
-        if (isinstance(prompt, dict)
-                and prompt.get("prompt_embeds", None) is not None
-                and not prompt.get("prompt_token_ids", None)):
-            # We use the -2 dimension (instead of 0) in case a batched input
-            # of batch size 1 is passed in.
-            prompt["prompt_token_ids"] = [0
-                                          ] * prompt["prompt_embeds"].shape[-2]
-
-        processed_inputs = await self.input_preprocessor.preprocess_async(
-            prompt,
-            tokenization_kwargs=tokenization_kwargs,
-        )
-
-        self._add_processed_request(
-            request_id=request_id,
-            processed_inputs=processed_inputs,
-            params=params,
-            arrival_time=arrival_time,
-            lora_request=lora_request,
-            trace_headers=trace_headers,
-            priority=priority,
-        )
-
-    async def check_health_async(self) -> None:
-        self.model_executor.check_health()
-
-    async def collective_rpc_async(self,
-                                   method: str,
-                                   timeout: Optional[float] = None,
-                                   args: tuple = (),
-                                   kwargs: Optional[dict] = None):
-        raise NotImplementedError
-
-
-class AsyncLLMEngine(EngineClient):
-    """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].
-
-    This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
-    make it asynchronous. It uses asyncio to create a background loop that keeps
-    processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
-    by the generate method when there are requests in the waiting queue. The
-    generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
-    to the caller.
-
-    Args:
-        log_requests: Whether to log the requests.
-        start_engine_loop: If True, the background task to run the engine
-            will be automatically started in the generate call.
-        *args: Arguments for [`LLMEngine`][vllm.LLMEngine].
-        **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
-    """
-
-    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
-
-    def __init__(self,
-                 *args: Any,
-                 log_requests: bool = True,
-                 start_engine_loop: bool = True,
-                 **kwargs: Any) -> None:
-        if envs.VLLM_USE_V1:
-            raise ValueError(
-                "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "
-                "This should not happen. As a workaround, try using "
-                "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
-                "VLLM_USE_V1=0 or 1 and report this issue on Github.")
-
-        self.log_requests = log_requests
-        self.engine = self._engine_class(*args, **kwargs)
-
-        # This ensures quick processing of request outputs
-        # so the append to asyncio queues is not delayed,
-        # especially for multi-step.
-        self.use_process_request_outputs_callback = (
-            self.engine.model_config.use_async_output_proc)
-
-        if self.use_process_request_outputs_callback:
-            self.engine.process_request_outputs_callback = \
-                weak_bind(self.process_request_outputs)
-
-        self.background_loop: Optional[asyncio.Future] = None
-        # We need to keep a reference to unshielded
-        # task as well to prevent it from being garbage
-        # collected
-        self._background_loop_unshielded: Optional[asyncio.Task] = None
-        self.start_engine_loop = start_engine_loop
-        self._errored_with: Optional[BaseException] = None
-
-        # Lazy initialized fields
-        self._request_tracker: RequestTracker
-
-    def __del__(self):
-        if rt := getattr(self, "request_tracker", None):
-            # Wake up engine loop so that it will exit cleanly
-            rt.new_requests_event.set()
-
-    @classmethod
-    def _get_executor_cls(cls,
-                          engine_config: VllmConfig) -> Type[ExecutorBase]:
-        return LLMEngine._get_executor_cls(engine_config)
-
-    @classmethod
-    @deprecate_kwargs(
-        "disable_log_requests",
-        additional_message=("This argument will have no effect. "
-                            "Use `enable_log_requests` instead."),
-    )
-    def from_vllm_config(
-            cls,
-            vllm_config: VllmConfig,
-            start_engine_loop: bool = True,
-            usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-            stat_loggers: Optional[dict[str, StatLoggerBase]] = None,
-            enable_log_requests: bool = False,
-            disable_log_stats: bool = False,
-            disable_log_requests: bool = True,  # Deprecated, will be removed
-    ) -> "AsyncLLMEngine":
-        """Create an AsyncLLMEngine from the EngineArgs."""
-
-        return cls(
-            vllm_config=vllm_config,
-            executor_class=cls._get_executor_cls(vllm_config),
-            start_engine_loop=start_engine_loop,
-            log_requests=enable_log_requests,
-            log_stats=not disable_log_stats,
-            usage_context=usage_context,
-            stat_loggers=stat_loggers,
-        )
-
-    @classmethod
-    def from_engine_args(
-        cls,
-        engine_args: AsyncEngineArgs,
-        start_engine_loop: bool = True,
-        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
-    ) -> "AsyncLLMEngine":
-        """Creates an async LLM engine from the engine arguments."""
-
-        vllm_config = engine_args.create_engine_config(usage_context)
-
-        async_engine_cls = cls
-        if envs.VLLM_USE_V1:
-            from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine
-            async_engine_cls = V1AsyncLLMEngine
-
-        return async_engine_cls.from_vllm_config(
-            vllm_config=vllm_config,
-            start_engine_loop=start_engine_loop,
-            usage_context=usage_context,
-            stat_loggers=stat_loggers,
-            disable_log_stats=engine_args.disable_log_stats,
-            enable_log_requests=engine_args.enable_log_requests,
-        )
-
-    @property
-    def is_running(self) -> bool:
-        return (self.background_loop is not None
-                and self._background_loop_unshielded is not None
-                and not self._background_loop_unshielded.done())
-
-    @property
-    def is_stopped(self) -> bool:
-        return self.errored or (self.background_loop is not None and
-                                self._background_loop_unshielded is not None
-                                and self._background_loop_unshielded.done())
-
-    @property
-    def errored(self) -> bool:
-        return self._errored_with is not None
-
-    @property
-    def dead_error(self) -> BaseException:
-        return AsyncEngineDeadError(
-            "Background loop is not running. If it was running, "
-            "inspect the output to find the stacktrace of the "
-            "error that caused the background loop to stop "
-            "(AsyncEngineDeadError).")
-
-    def set_errored(self, exc: Exception) -> None:
-        self._errored_with = exc
-
-    def _error_callback(self, exc: Exception) -> None:
-        self.set_errored(exc)
-        self._request_tracker.propagate_exception(exc)
-
-    async def get_input_preprocessor(self) -> InputPreprocessor:
-        return self.engine.input_preprocessor
-
-    async def get_tokenizer(self) -> AnyTokenizer:
-        return self.engine.get_tokenizer()
-
-    def start_background_loop(self) -> None:
-        """Start the background loop."""
-        if self.errored:
-            raise AsyncEngineDeadError(
-                "Background loop has errored already.") from self._errored_with
-        if self.is_running:
-            raise RuntimeError("Background loop is already running.")
-        # Initialize the RequestTracker here so it uses the right event loop.
-        self._request_tracker = RequestTracker()
-
-        self._background_loop_unshielded = asyncio.get_event_loop(
-        ).create_task(self.run_engine_loop(weakref.ref(self)))
-        self._background_loop_unshielded.add_done_callback(
-            partial(_log_task_completion, error_callback=self._error_callback))
-        self.background_loop = asyncio.shield(self._background_loop_unshielded)
-
-    def shutdown_background_loop(self) -> None:
-        """
-        Shut down the background loop.
-
-        This method needs to be called during cleanup to remove
-        references to `self` and properly GC the resources held
-        by the async LLM engine (e.g., the executors as well as
-        their resources).
-        """
-        if self._background_loop_unshielded is not None:
-            self._background_loop_unshielded.cancel()
-            self._background_loop_unshielded = None
-        self.background_loop = None
-
-    async def engine_step(self, virtual_engine: int) -> bool:
-        """Kick the engine to process the waiting requests.
-
-        Returns True if there are in-progress requests."""
-
-        new_requests, aborted_requests = (
-            self._request_tracker.get_new_and_aborted_requests())
-
-        for new_request in new_requests:
-            # Add the request into the vLLM engine's waiting queue.
-            try:
-                await self.engine.add_request_async(**new_request)
-            except ValueError as e:
-                # TODO: use a vLLM specific error for failed validation
-                self._request_tracker.process_exception(
-                    new_request["request_id"],
-                    e,
-                    verbose=self.log_requests,
-                )
-
-        if aborted_requests:
-            await self._engine_abort(aborted_requests)
-
-        request_outputs = await self.engine.step_async(virtual_engine)
-
-        # Put the outputs into the corresponding streams.
-        # If used as a callback, then already invoked inside
-        # LLMEngine's _process_model_outputs
-        if not self.use_process_request_outputs_callback:
-            all_finished = self.process_request_outputs(request_outputs)
-        else:
-            # For callback case, we only need to detect when all
-            # requests are finished
-            all_finished = all(request_output.finished
-                               for request_output in request_outputs)
-
-        return not all_finished
-
-    def process_request_outputs(self, request_outputs) -> bool:
-        # Put the outputs into the corresponding streams.
-        all_finished = True
-        for request_output in request_outputs:
-            self._request_tracker.process_request_output(
-                request_output, verbose=self.log_requests)
-            all_finished = all_finished and request_output.finished
-
-        return all_finished
-
-    async def _engine_abort(self, request_ids: Iterable[str]):
-        self.engine.abort_request(request_ids)
-
-    @staticmethod
-    async def run_engine_loop(engine_ref: ReferenceType):
-        """We use a weakref to the engine so that the running loop
-        doesn't prevent the engine being garbage collected."""
-        engine: Optional[AsyncLLMEngine] = engine_ref()
-        if not engine:
-            return
-
-        pipeline_parallel_size = \
-                engine.engine.parallel_config.pipeline_parallel_size
-        has_requests_in_progress = [False] * pipeline_parallel_size
-        while True:
-            if not any(has_requests_in_progress):
-                logger.debug("Waiting for new requests...")
-                # Stop the execute model loop in parallel workers until there
-                # are more requests to process. This avoids waiting
-                # indefinitely in torch.distributed ops which may otherwise
-                # time out, and unblocks the RPC thread in the workers so that
-                # they can process any other queued control plane messages,
-                # such as add/remove lora adapters.
-                await engine.engine.stop_remote_worker_execution_loop_async()
-                request_tracker = engine._request_tracker
-                # Allow engine to be garbage collected while
-                # waiting for new requests
-                del engine
-                await asyncio.sleep(0)
-                if engine_ref() is None:
-                    return
-                await request_tracker.wait_for_new_requests()
-                engine = engine_ref()
-                if not engine:
-                    return
-                logger.debug("Got new requests!")
-                requests_in_progress = [
-                    asyncio.create_task(engine.engine_step(ve))
-                    for ve in range(pipeline_parallel_size)
-                ]
-                has_requests_in_progress = [True] * pipeline_parallel_size
-
-            # Abort if iteration takes too long due to unrecoverable errors
-            # (eg. NCCL timeouts).
-            try:
-                async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
-                    done, _ = await asyncio.wait(
-                        requests_in_progress,
-                        return_when=asyncio.FIRST_COMPLETED)
-                    for _ in range(pipeline_parallel_size):
-                        await asyncio.sleep(0)
-                for task in done:
-                    result = task.result()
-                    virtual_engine = requests_in_progress.index(task)
-                    has_unfinished_requests = (
-                        engine.engine.
-                        has_unfinished_requests_for_virtual_engine(
-                            virtual_engine))
-                    if result or has_unfinished_requests:
-                        requests_in_progress[virtual_engine] = (
-                            asyncio.create_task(
-                                engine.engine_step(virtual_engine)))
-                        has_requests_in_progress[virtual_engine] = True
-                    else:
-                        has_requests_in_progress[virtual_engine] = False
-            except asyncio.TimeoutError as exc:
-                logger.error(
-                    "Engine iteration timed out. This should never happen!")
-                engine.set_errored(exc)
-                raise
-            await asyncio.sleep(0)
-
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: PromptType,
-        params: SamplingParams,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        if not self.is_running:
-            if self.start_engine_loop:
-                self.start_background_loop()
-            else:
-                raise AsyncEngineDeadError(
-                    "Background loop is not running. If it was running, "
-                    "inspect the output to find the stacktrace of the "
-                    "error that caused the background loop to stop "
-                    "(AsyncEngineDeadError).")
-
-        if (priority != 0
-                and not self.engine.scheduler_config.policy == "priority"):
-            raise ValueError(f"Got priority {priority} but "
-                             "Priority scheduling is not enabled.")
-
-        stream = self._request_tracker.add_request(
-            request_id,
-            verbose=self.log_requests,
-            prompt=prompt,
-            params=params,
-            arrival_time=arrival_time or time.time(),
-            lora_request=lora_request,
-            trace_headers=trace_headers,
-            priority=priority,
-            data_parallel_rank=data_parallel_rank,
-            tokenization_kwargs=tokenization_kwargs,
-        )
-
-        return stream.generator()
-
-    async def generate(
-        self,
-        prompt: PromptType,
-        sampling_params: SamplingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> AsyncGenerator[RequestOutput, None]:
-        """Generate outputs for a request.
-
-        Generate outputs for a request. This method is a coroutine. It adds the
-        request into the waiting queue of the LLMEngine and streams the outputs
-        from the LLMEngine to the caller.
-
-        Args:
-            prompt: The prompt to the LLM. See
-                [`PromptType`][vllm.inputs.PromptType] for more details about
-                the format of each input.
-            sampling_params: The sampling parameters of the request.
-            request_id: The unique id of the request.
-            lora_request: LoRA request to use for generation, if any.
-            trace_headers: OpenTelemetry trace headers.
-            priority: The priority of the request.
-                Only applicable with priority scheduling.
-            data_parallel_rank: The (global) data parallel rank that must
-                handle this request. Only applicable if DP is enabled.
-        Yields:
-            The output `RequestOutput` objects from the LLMEngine
-            for the request.
-
-        Details:
-            - If the engine is not running, start the background loop,
-              which iteratively invokes
-              [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
-              to process the waiting requests.
-            - Add the request to the engine's `RequestTracker`.
-              On the next background loop, this request will be sent to
-              the underlying engine.
-              Also, a corresponding `AsyncStream` will be created.
-            - Wait for the request outputs from `AsyncStream` and yield them.
-
-        Example:
-            >>> # Please refer to entrypoints/api_server.py for
-            >>> # the complete example.
-            >>>
-            >>> # initialize the engine and the example input
-            >>> # note that engine_args here is AsyncEngineArgs instance
-            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
-            >>> example_input = {
-            >>>     "prompt": "What is LLM?",
-            >>>     "stream": False, # assume the non-streaming case
-            >>>     "temperature": 0.0,
-            >>>     "request_id": 0,
-            >>> }
-            >>>
-            >>> # start the generation
-            >>> results_generator = engine.generate(
-            >>>    example_input["prompt"],
-            >>>    SamplingParams(temperature=example_input["temperature"]),
-            >>>    example_input["request_id"])
-            >>>
-            >>> # get the results
-            >>> final_output = None
-            >>> async for request_output in results_generator:
-            >>>     if await request.is_disconnected():
-            >>>         # Abort the request if the client disconnects.
-            >>>         await engine.abort(request_id)
-            >>>         # Return or raise an error
-            >>>         ...
-            >>>     final_output = request_output
-            >>>
-            >>> # Process and return the final output
-            >>> ...
-        """
-        try:
-            async for output in await self.add_request(
-                    request_id,
-                    prompt,
-                    sampling_params,
-                    lora_request=lora_request,
-                    trace_headers=trace_headers,
-                    priority=priority,
-                    data_parallel_rank=data_parallel_rank,
-            ):
-                yield LLMEngine.validate_output(output, RequestOutput)
-        except asyncio.CancelledError:
-            await self.abort(request_id)
-            raise
-
-    def encode(
-        self,
-        prompt: PromptType,
-        pooling_params: PoolingParams,
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-    ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        raise NotImplementedError(
-            "Pooling models are not supported in vLLM V0")
-
-    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
-        """Abort a request.
-
-        Abort a submitted request. If the request is finished or not found,
-        this method will be a no-op.
-
-        Args:
-            request_id: The unique id of the request.
-        """
-        if not isinstance(request_id, str):
-            raise RuntimeError("Only single-request abort supported in"
-                               " deprecated V0")
-        if not self.is_running:
-            raise AsyncEngineDeadError(
-                "Background loop is not running. If it was running, "
-                "inspect the output to find the stacktrace of the "
-                "error that caused the background loop to stop "
-                "(AsyncEngineDeadError).")
-
-        return self._abort(request_id)
-
-    def _abort(self, request_id: str) -> None:
-        """Abort a request.
-
-        Abort a submitted request. If the request is finished or not found,
-        this method will be a no-op.
-
-        Args:
-            request_id: The unique id of the request.
-        """
-        self._request_tracker.abort_request(request_id,
-                                            exception=asyncio.CancelledError,
-                                            verbose=self.log_requests)
-
-    async def get_vllm_config(self) -> VllmConfig:
-        """Get the vllm configuration of the vLLM engine."""
-        return self.engine.get_vllm_config()
-
-    async def get_model_config(self) -> ModelConfig:
-        """Get the model configuration of the vLLM engine."""
-        return self.engine.get_model_config()
-
-    async def get_parallel_config(self) -> ParallelConfig:
-        """Get the parallel configuration of the vLLM engine."""
-        return self.engine.get_parallel_config()
-
-    async def get_scheduler_config(self) -> SchedulerConfig:
-        """Get the scheduling configuration of the vLLM engine."""
-        return self.engine.get_scheduler_config()
-
-    async def get_lora_config(self) -> LoRAConfig:
-        """Get the lora configuration of the vLLM engine."""
-        return self.engine.get_lora_config()
-
-    async def do_log_stats(
-            self,
-            scheduler_outputs: Optional[SchedulerOutputs] = None,
-            model_output: Optional[List[SamplerOutput]] = None) -> None:
-        self.engine.do_log_stats()
-
-    async def check_health(self) -> None:
-        """Raises an error if engine is unhealthy."""
-        t = time.perf_counter()
-        logger.debug("Starting health check...")
-        if self.is_stopped:
-            raise AsyncEngineDeadError("Background loop is stopped.")
-
-        await self.engine.check_health_async()
-        logger.debug("Health check took %fs", time.perf_counter() - t)
-
-    async def is_tracing_enabled(self) -> bool:
-        return self.engine.is_tracing_enabled()
-
-    def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None:
-        self.engine.add_logger(logger_name=logger_name, logger=logger)
-
-    def remove_logger(self, logger_name: str) -> None:
-        self.engine.remove_logger(logger_name=logger_name)
-
-    async def start_profile(self) -> None:
-        self.engine.start_profile()
-
-    async def stop_profile(self) -> None:
-        self.engine.stop_profile()
-
-    async def reset_mm_cache(self) -> None:
-        self.engine.reset_mm_cache()
-
-    async def reset_prefix_cache(self,
-                                 device: Optional[Device] = None) -> None:
-        self.engine.reset_prefix_cache(device)
-
-    async def sleep(self, level: int = 1) -> None:
-        await self.reset_prefix_cache()
-        self.engine.sleep(level)
-
-    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        self.engine.wake_up(tags)
-
-    async def is_sleeping(self) -> bool:
-        return self.engine.is_sleeping()
-
-    async def add_lora(self, lora_request: LoRARequest) -> bool:
-        return self.engine.add_lora(lora_request)
-
-    async def collective_rpc(self,
-                             method: str,
-                             timeout: Optional[float] = None,
-                             args: tuple = (),
-                             kwargs: Optional[dict] = None):
-        """
-        Perform a collective RPC call to the given path.
-        """
-        return await self.engine.collective_rpc_async(method, timeout, args,
-                                                      kwargs)
-
-
-# TODO(v1): Remove this class proxy when V1 goes default.
-if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1:
-    from vllm.v1.engine.async_llm import AsyncLLM
-
-    AsyncLLMEngine = AsyncLLM  # type: ignore
+AsyncLLMEngine = AsyncLLM  # type: ignore
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index c3195dbc4697f..8b2acedf805c1 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -11,7 +11,6 @@ import uvicorn
 from fastapi import FastAPI, Request, Response
 
 from vllm import envs
-from vllm.engine.async_llm_engine import AsyncEngineDeadError
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
                                         H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
@@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
     """
 
     @app.exception_handler(RuntimeError)
-    @app.exception_handler(AsyncEngineDeadError)
     @app.exception_handler(EngineDeadError)
     @app.exception_handler(EngineGenerateError)
     async def runtime_exception_handler(request: Request, __):
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 912e664120929..11031cd616d20 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -38,7 +38,6 @@ from typing_extensions import assert_never
 import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (load_chat_template,
                                          resolve_hf_chat_template,
@@ -201,50 +200,34 @@ async def build_async_engine_client_from_engine_args(
     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
 
     # V1 AsyncLLM.
-    if envs.VLLM_USE_V1:
-        if disable_frontend_multiprocessing:
-            logger.warning(
-                "V1 is enabled, but got --disable-frontend-multiprocessing. "
-                "To disable frontend multiprocessing, set VLLM_USE_V1=0.")
+    assert envs.VLLM_USE_V1
 
-        from vllm.v1.engine.async_llm import AsyncLLM
-        async_llm: Optional[AsyncLLM] = None
-        client_count = client_config.pop(
-            "client_count") if client_config else 1
-        client_index = client_config.pop(
-            "client_index") if client_config else 0
-        try:
-            async_llm = AsyncLLM.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats,
-                client_addresses=client_config,
-                client_count=client_count,
-                client_index=client_index)
+    if disable_frontend_multiprocessing:
+        logger.warning(
+            "V1 is enabled, but got --disable-frontend-multiprocessing. "
+            "To disable frontend multiprocessing, set VLLM_USE_V1=0.")
 
-            # Don't keep the dummy data in memory
-            await async_llm.reset_mm_cache()
+    from vllm.v1.engine.async_llm import AsyncLLM
+    async_llm: Optional[AsyncLLM] = None
+    client_count = client_config.pop("client_count") if client_config else 1
+    client_index = client_config.pop("client_index") if client_config else 0
+    try:
+        async_llm = AsyncLLM.from_vllm_config(
+            vllm_config=vllm_config,
+            usage_context=usage_context,
+            enable_log_requests=engine_args.enable_log_requests,
+            disable_log_stats=engine_args.disable_log_stats,
+            client_addresses=client_config,
+            client_count=client_count,
+            client_index=client_index)
 
-            yield async_llm
-        finally:
-            if async_llm:
-                async_llm.shutdown()
+        # Don't keep the dummy data in memory
+        await async_llm.reset_mm_cache()
 
-    # V0 AsyncLLM.
-    else:
-
-        engine_client: Optional[EngineClient] = None
-        try:
-            engine_client = AsyncLLMEngine.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats)
-            yield engine_client
-        finally:
-            if engine_client and hasattr(engine_client, "shutdown"):
-                engine_client.shutdown()
+        yield async_llm
+    finally:
+        if async_llm:
+            async_llm.shutdown()
 
 
 async def validate_json_request(raw_request: Request):

From 064cac7bb7251862a841d8057d83581350edf837 Mon Sep 17 00:00:00 2001
From: Nikhil Gupta <nikhil.gupta2@arm.com>
Date: Thu, 18 Sep 2025 19:15:23 +0100
Subject: [PATCH 48/58] [fix]: remove data type hardcoding from gptoss model
 implementation (#23807)

Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
---
 vllm/model_executor/models/gpt_oss.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 990a1d6d883a1..b49fd0d8f88af 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -76,7 +76,6 @@ class OAIAttention(nn.Module):
 
         self.sinks = torch.nn.Parameter(
             torch.empty(config.num_attention_heads // tp_size,
-                        dtype=torch.bfloat16,
                         requires_grad=False))
 
         self.q_size = self.num_attention_heads * self.head_dim // tp_size
@@ -145,8 +144,7 @@ class MLPBlock(torch.nn.Module):
         self.experts_per_token = config.num_experts_per_tok
         self.world_size = dist.get_world_size() if dist.is_initialized() else 1
         self.router = torch.nn.Linear(config.hidden_size,
-                                      config.num_local_experts,
-                                      dtype=torch.bfloat16)
+                                      config.num_local_experts)
         assert config.intermediate_size % self.world_size == 0
         self.experts = FusedMoE(num_experts=config.num_local_experts,
                                 top_k=config.num_experts_per_tok,

From 38db529f66712502a3cf93488229fc9fd2dc76fc Mon Sep 17 00:00:00 2001
From: Aziz <azizbenothman76@gmail.com>
Date: Thu, 18 Sep 2025 21:18:56 +0200
Subject: [PATCH 49/58] [feat]: Create interface for model-specific M-RoPE
 (#24194)

Signed-off-by: AzizCode92 <azizbenothman76@gmail.com>
Signed-off-by: Aziz <azizbenothman76@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/__init__.py   |  11 ++-
 vllm/model_executor/models/interfaces.py |  68 +++++++++++++
 vllm/model_executor/models/qwen2_vl.py   | 118 ++++++++++++++++++++++-
 vllm/v1/worker/gpu_model_runner.py       |  33 +++++--
 vllm/worker/model_runner.py              |  42 +++++---
 5 files changed, 242 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index d3ee6872dd8bf..4ccba64f2c110 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
-                         SupportsPP, SupportsTranscription, SupportsV0Only,
-                         has_inner_state, supports_lora, supports_multimodal,
-                         supports_pp, supports_transcription, supports_v0_only)
+from .interfaces import (HasInnerState, SupportsLoRA, SupportsMRoPE,
+                         SupportsMultiModal, SupportsPP, SupportsTranscription,
+                         SupportsV0Only, has_inner_state, supports_lora,
+                         supports_mrope, supports_multimodal, supports_pp,
+                         supports_transcription, supports_v0_only)
 from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration,
                               is_pooling_model, is_text_generation_model)
 from .registry import ModelRegistry
@@ -21,6 +22,8 @@ __all__ = [
     "supports_lora",
     "SupportsMultiModal",
     "supports_multimodal",
+    "SupportsMRoPE",
+    "supports_mrope",
     "SupportsPP",
     "supports_pp",
     "SupportsTranscription",
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 8f8e300c84d71..e9c600e36cfa7 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
 import numpy as np
 import torch
 from torch import Tensor
+from transformers import PretrainedConfig
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
 
@@ -852,3 +853,70 @@ def supports_eagle3(
     model: Union[type[object], object],
 ) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]:
     return isinstance(model, SupportsEagle3)
+
+
+@runtime_checkable
+class SupportsMRoPE(Protocol):
+    """The interface required for all models that support M-RoPE."""
+
+    supports_mrope: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports M-RoPE.
+    
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        second_per_grid_ts: Optional[list[float]] = None,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """
+        Get M-RoPE input positions and delta value for this specific model.
+
+        This method should be implemented by each model that supports M-RoPE
+        to provide model-specific logic for computing input positions.
+
+        Args:
+            input_tokens: List of input token IDs
+            hf_config: HuggingFace model configuration
+            image_grid_thw: Image grid dimensions (t, h, w)
+            video_grid_thw: Video grid dimensions (t, h, w)
+            second_per_grid_ts: Seconds per grid timestep for videos
+            context_len: Start index of the slice of positions to return
+            seq_len: End index (exclusive) of that slice; None means the end
+            audio_feature_lengths: Audio feature lengths for multimodal models
+            use_audio_in_video: Whether to use audio in video for interleaving
+
+        Returns:
+            Tuple of (llm_positions, mrope_position_delta)
+            - llm_positions: Tensor of shape [3, num_tokens]
+                with T/H/W positions
+            - mrope_position_delta: Delta for position calculations
+        """
+        ...
+
+
+@overload
+def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]:
+    ...
+
+
+@overload
+def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]:
+    ...
+
+
+def supports_mrope(
+    model: Union[type[object], object],
+) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]:
+    return isinstance(model, SupportsMRoPE)
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index b6576b783b64a..7f361678ba72e 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -32,7 +32,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import AutoConfig, BatchFeature
+from transformers import AutoConfig, BatchFeature, PretrainedConfig
 from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
                                           Qwen2VLProcessor)
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
@@ -73,7 +73,7 @@ from vllm.transformers_utils.config import uses_mrope
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE,
                          SupportsMultiModal, SupportsPP)
 from .utils import (AutoWeightsLoader, WeightsMapper,
                     init_vllm_registered_model, maybe_prefix,
@@ -1096,7 +1096,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
                                         info=Qwen2VLProcessingInfo,
                                         dummy_inputs=Qwen2VLDummyInputsBuilder)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsLoRA, SupportsPP):
+                                      SupportsLoRA, SupportsPP, SupportsMRoPE):
 
     # To ensure correct weight loading and mapping.
     hf_to_vllm_mapper = WeightsMapper(
@@ -1109,6 +1109,118 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
             "model.": "language_model.model.",
         })
 
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        second_per_grid_ts: Optional[list[float]] = None,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get M-RoPE input positions for Qwen2-VL model."""
+        if image_grid_thw is None:
+            image_grid_thw = []
+        if video_grid_thw is None:
+            video_grid_thw = []
+        if second_per_grid_ts is None:
+            second_per_grid_ts = []
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config,
+                                    "tokens_per_second", 1.0)
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_videos > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = \
+                t, h // spatial_merge_size, w // spatial_merge_size
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
+                       tokens_per_second).long().flatten()
+
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
+                llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
+                llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 -
+                                len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+
+        return llm_positions, mrope_position_delta
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
         if modality.startswith("image"):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4873b586724ec..053e8f0537ed9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.model_executor.models.interfaces import (is_mixture_of_experts,
                                                    supports_eagle3,
+                                                   supports_mrope,
                                                    supports_transcription)
 from vllm.model_executor.models.interfaces_base import (
     VllmModelForPooling, is_pooling_model, is_text_generation_model)
@@ -730,16 +731,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             if mm_input.get("use_audio_in_video") is True:
                 use_audio_in_video = True
 
-        req_state.mrope_positions, req_state.mrope_position_delta = \
-            MRotaryEmbedding.get_input_positions_tensor(
-                req_state.prompt_token_ids,
-                hf_config=self.model_config.hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
+        if supports_mrope(self.model):
+            req_state.mrope_positions, req_state.mrope_position_delta = \
+                self.model.get_mrope_input_positions(
+                    req_state.prompt_token_ids,
+                    hf_config=self.model_config.hf_config,
+                    image_grid_thw=image_grid_thw,
+                    video_grid_thw=video_grid_thw,
+                    second_per_grid_ts=second_per_grid_ts,
+                    audio_feature_lengths=audio_feature_lengths,
+                    use_audio_in_video=use_audio_in_video,
+                )
+        else:
+            req_state.mrope_positions, req_state.mrope_position_delta = \
+                MRotaryEmbedding.get_input_positions_tensor(
+                    req_state.prompt_token_ids,
+                    hf_config=self.model_config.hf_config,
+                    image_grid_thw=image_grid_thw,
+                    video_grid_thw=video_grid_thw,
+                    second_per_grid_ts=second_per_grid_ts,
+                    audio_feature_lengths=audio_feature_lengths,
+                    use_audio_in_video=use_audio_in_video,
+                )
 
     def _extract_mm_kwargs(
         self,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 88f83c9dd7e6c..594382650e3c1 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -41,7 +41,8 @@ from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput,
                                                 get_sampler)
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.model_executor.models import supports_lora, supports_multimodal
+from vllm.model_executor.models import (supports_lora, supports_mrope,
+                                        supports_multimodal)
 from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                              MultiModalKwargs, MultiModalPlaceholderMap,
@@ -670,18 +671,33 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
                     inter_data.seq_ids[seq_idx]]
                 token_ids = seq_data.get_token_ids()
 
-                mrope_input_positions, mrope_position_delta = \
-                    MRotaryEmbedding.get_input_positions(
-                        token_ids,
-                        hf_config=hf_config,
-                        image_grid_thw=image_grid_thw,
-                        video_grid_thw=video_grid_thw,
-                        second_per_grid_ts=second_per_grid_ts,
-                        context_len=inter_data.context_lens[seq_idx],
-                        seq_len=inter_data.seq_lens[seq_idx],
-                        audio_feature_lengths=audio_feature_lengths,
-                        use_audio_in_video=use_audio_in_video,
-                    )
+                if supports_mrope(self.runner.model):
+                    mrope_input_positions, mrope_position_delta = \
+                        self.runner.model.get_mrope_input_positions(
+                            token_ids,
+                            hf_config=hf_config,
+                            image_grid_thw=image_grid_thw,
+                            video_grid_thw=video_grid_thw,
+                            second_per_grid_ts=second_per_grid_ts,
+                            context_len=inter_data.context_lens[seq_idx],
+                            seq_len=inter_data.seq_lens[seq_idx],
+                            audio_feature_lengths=audio_feature_lengths,
+                            use_audio_in_video=use_audio_in_video,
+                        )
+                    mrope_input_positions = mrope_input_positions.tolist()
+                else:
+                    mrope_input_positions, mrope_position_delta = \
+                        MRotaryEmbedding.get_input_positions(
+                            token_ids,
+                            hf_config=hf_config,
+                            image_grid_thw=image_grid_thw,
+                            video_grid_thw=video_grid_thw,
+                            second_per_grid_ts=second_per_grid_ts,
+                            context_len=inter_data.context_lens[seq_idx],
+                            seq_len=inter_data.seq_lens[seq_idx],
+                            audio_feature_lengths=audio_feature_lengths,
+                            use_audio_in_video=use_audio_in_video,
+                        )
 
                 seq_data.mrope_position_delta = mrope_position_delta
                 inter_data.mrope_input_positions[

From 75fb112d80f680624dc99a00e02be6a45661f948 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 18 Sep 2025 15:32:24 -0400
Subject: [PATCH 50/58] [Bug] Fix `returned_lse` not Defined issue (#25106)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 vllm/v1/attention/backends/mla/cutlass_mla.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 21be17a750df4..ae534f3207b51 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -206,12 +206,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         )
 
         if H < MAX_HEADS:
-            # Extract the subsets of the outputs
-            returned_lse = lse[:, :H].contiguous(
-            ) if self.need_to_return_lse_for_decode else lse
             out = out[:, :H]
+            if self.need_to_return_lse_for_decode:
+                lse = lse[:, :H].contiguous()
 
-        return out, returned_lse
+        return out, lse
 
     def _forward_decode(
         self,

From d2a30a2d933226d3951ad98cb5de0c74e2e64826 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Thu, 18 Sep 2025 15:38:37 -0400
Subject: [PATCH 51/58] [Bug] Fix torch Compilation Cache Hit Error (#25093)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
---
 vllm/config/compilation.py | 12 ------------
 vllm/platforms/cuda.py     | 17 ++++++++++-------
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index f8ccc20222615..3618f472e742d 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -563,18 +563,6 @@ class CompilationConfig:
                 self.cudagraph_mode = CUDAGraphMode.FULL
             self.splitting_ops = []
 
-        if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
-            # exclude MoE dispatch/combine from capture by ensuring
-            # piecewise splitting includes them, so communication remains
-            # outside CUDA graphs while compute can still be graphed.
-            moe_ops = [
-                "vllm.moe_forward",
-                "vllm.moe_forward_shared",
-            ]
-            for op in moe_ops:
-                if op not in self.splitting_ops:
-                    self.splitting_ops.append(op)
-
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
             op in self.splitting_ops for op in self._attention_ops)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 8e3436a9e73c5..87d8f2b7481bb 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -191,14 +191,17 @@ class CudaPlatformBase(Platform):
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                 and parallel_config.data_parallel_size > 1
-                and compilation_config.cudagraph_mode
-                not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]):
+                and compilation_config.cudagraph_mode != CUDAGraphMode.NONE):
+            # TODO: Piecewise CUDA graphs might be re-enabled
+            # once the torch compile cache key issue is fixed.
+            # See https://github.com/vllm-project/vllm/pull/25093
             logger.info(
-                "Data Parallel with DeepEP high-throughput: using PIECEWISE "
-                "CUDA graphs and excluding MoE ops from capture. Set "
-                "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
-                "graphs captured as well.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                "Data Parallel: disabling cudagraphs since DP "
+                "with DeepEP high-throughput kernels are not CUDA Graph "
+                "compatible. The DeepEP low-latency kernels are CUDA Graph "
+                "compatible. Set the all_to_all backend to deepep_low_latency "
+                "to use those kernels instead.")
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
     @classmethod
     def get_current_memory_usage(cls,

From 1c3dad22ff92cbf84e0fa8ad1643c560a07944ea Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Thu, 18 Sep 2025 13:35:21 -0700
Subject: [PATCH 52/58] [V0 Deprecation] Remove unused async_timeout.py
 (#25190)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/engine/async_timeout.py | 173 -----------------------------------
 1 file changed, 173 deletions(-)
 delete mode 100644 vllm/engine/async_timeout.py

diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py
deleted file mode 100644
index 3b9c055160c1b..0000000000000
--- a/vllm/engine/async_timeout.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Workaround for https://github.com/python/cpython/issues/86296
-#
-# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py
-# Licensed under the Apache License (Apache-2.0)
-
-import asyncio
-import enum
-import sys
-from types import TracebackType
-from typing import Any, Optional, Type
-
-if sys.version_info[:2] >= (3, 11):
-    from asyncio import timeout as asyncio_timeout
-else:
-
-    class _State(enum.Enum):
-        INIT = "INIT"
-        ENTER = "ENTER"
-        TIMEOUT = "TIMEOUT"
-        EXIT = "EXIT"
-
-    class Timeout:
-        # Internal class, please don't instantiate it directly
-        # Use timeout() and timeout_at() public factories instead.
-        #
-        # Implementation note: `async with timeout()` is preferred
-        # over `with timeout()`.
-        # While technically the Timeout class implementation
-        # doesn't need to be async at all,
-        # the `async with` statement explicitly points that
-        # the context manager should be used from async function context.
-        #
-        # This design allows to avoid many silly misusages.
-        #
-        # TimeoutError is raised immediately when scheduled
-        # if the deadline is passed.
-        # The purpose is to time out as soon as possible
-        # without waiting for the next await expression.
-
-        __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler")
-
-        def __init__(self, deadline: Optional[float],
-                     loop: asyncio.AbstractEventLoop) -> None:
-            self._loop = loop
-            self._state = _State.INIT
-
-            self._timeout_handler = None  # type: Optional[asyncio.Handle]
-            if deadline is None:
-                self._deadline = None  # type: Optional[float]
-            else:
-                self.update(deadline)
-
-        async def __aenter__(self) -> "Timeout":
-            self._do_enter()
-            return self
-
-        async def __aexit__(
-            self,
-            exc_type: Optional[Type[BaseException]],
-            exc_val: Optional[BaseException],
-            exc_tb: Optional[TracebackType],
-        ) -> Optional[bool]:
-            self._do_exit(exc_type)
-            return None
-
-        @property
-        def expired(self) -> bool:
-            """Is timeout expired during execution?"""
-            return self._state == _State.TIMEOUT
-
-        @property
-        def deadline(self) -> Optional[float]:
-            return self._deadline
-
-        def reject(self) -> None:
-            """Reject scheduled timeout if any."""
-            # cancel is maybe better name but
-            # task.cancel() raises CancelledError in asyncio world.
-            if self._state not in (_State.INIT, _State.ENTER):
-                raise RuntimeError(f"invalid state {self._state.value}")
-            self._reject()
-
-        def _reject(self) -> None:
-            if self._timeout_handler is not None:
-                self._timeout_handler.cancel()
-                self._timeout_handler = None
-
-        def shift(self, delay: float) -> None:
-            """Advance timeout on delay seconds.
-            The delay can be negative.
-            Raise RuntimeError if shift is called when deadline is not scheduled
-            """
-            deadline = self._deadline
-            if deadline is None:
-                raise RuntimeError(
-                    "cannot shift timeout if deadline is not scheduled")
-            self.update(deadline + delay)
-
-        def update(self, deadline: float) -> None:
-            """Set deadline to absolute value.
-            deadline argument points on the time in the same clock system
-            as loop.time().
-            If new deadline is in the past the timeout is raised immediately.
-            Please note: it is not POSIX time but a time with
-            undefined starting base, e.g. the time of the system power on.
-            """
-            if self._state == _State.EXIT:
-                raise RuntimeError(
-                    "cannot reschedule after exit from context manager")
-            if self._state == _State.TIMEOUT:
-                raise RuntimeError("cannot reschedule expired timeout")
-            if self._timeout_handler is not None:
-                self._timeout_handler.cancel()
-            self._deadline = deadline
-            if self._state != _State.INIT:
-                self._reschedule()
-
-        def _reschedule(self) -> None:
-            assert self._state == _State.ENTER
-            deadline = self._deadline
-            if deadline is None:
-                return
-
-            now = self._loop.time()
-            if self._timeout_handler is not None:
-                self._timeout_handler.cancel()
-
-            task = asyncio.current_task()
-            if deadline <= now:
-                self._timeout_handler = self._loop.call_soon(
-                    self._on_timeout, task)
-            else:
-                self._timeout_handler = self._loop.call_at(
-                    deadline, self._on_timeout, task)
-
-        def _do_enter(self) -> None:
-            if self._state != _State.INIT:
-                raise RuntimeError(f"invalid state {self._state.value}")
-            self._state = _State.ENTER
-            self._reschedule()
-
-        def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None:
-            if exc_type is asyncio.CancelledError and \
-                    self._state == _State.TIMEOUT:
-                self._timeout_handler = None
-                raise asyncio.TimeoutError
-            # timeout has not expired
-            self._state = _State.EXIT
-            self._reject()
-            return None
-
-        def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None:
-            if task:
-                task.cancel()
-            self._state = _State.TIMEOUT
-            # drop the reference early
-            self._timeout_handler = None
-
-    def asyncio_timeout(delay: Optional[float]) -> Timeout:
-        """timeout context manager.
-        Useful in cases when you want to apply timeout logic around block
-        of code or in cases when asyncio.wait_for is not suitable. For example:
-        >>> async with timeout(0.001):
-        ...     async with aiohttp.get('https://github.com') as r:
-        ...         await r.text()
-        delay - value in seconds or None to disable timeout logic
-        """
-        loop = asyncio.get_running_loop()
-        deadline = loop.time() + delay if delay is not None else None
-        return Timeout(deadline, loop)

From a53ad626d629e79264f0a6ab6820a4b547f3b1c4 Mon Sep 17 00:00:00 2001
From: Or Ozeri <oro@il.ibm.com>
Date: Thu, 18 Sep 2025 23:53:52 +0300
Subject: [PATCH 53/58] [KV offload][1b/N] rename offloading to kv_offload
 (#25191)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
---
 .buildkite/test-pipeline.yaml                       | 2 +-
 tests/v1/{offloading => kv_offload}/test_worker.py  | 4 ++--
 vllm/v1/{offloading => kv_offload}/abstract.py      | 0
 vllm/v1/{offloading => kv_offload}/mediums.py       | 2 +-
 vllm/v1/{offloading => kv_offload}/worker/worker.py | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)
 rename tests/v1/{offloading => kv_offload}/test_worker.py (97%)
 rename vllm/v1/{offloading => kv_offload}/abstract.py (100%)
 rename vllm/v1/{offloading => kv_offload}/mediums.py (93%)
 rename vllm/v1/{offloading => kv_offload}/worker/worker.py (98%)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 5fd08296625ad..c42ec4f2503d0 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -280,7 +280,7 @@ steps:
     # split the test to avoid interference
     - pytest -v -s v1/core
     - pytest -v -s v1/executor
-    - pytest -v -s v1/offloading
+    - pytest -v -s v1/kv_offload
     - pytest -v -s v1/sample
     - pytest -v -s v1/logits_processors
     - pytest -v -s v1/worker
diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/kv_offload/test_worker.py
similarity index 97%
rename from tests/v1/offloading/test_worker.py
rename to tests/v1/kv_offload/test_worker.py
index 2391b565773aa..6cf8aa0875d62 100644
--- a/tests/v1/offloading/test_worker.py
+++ b/tests/v1/kv_offload/test_worker.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.offloading.abstract import LoadStoreSpec
-from vllm.v1.offloading.worker.worker import (OffloadingHandler,
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
                                               OffloadingWorker, TransferResult,
                                               TransferSpec)
 
diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/kv_offload/abstract.py
similarity index 100%
rename from vllm/v1/offloading/abstract.py
rename to vllm/v1/kv_offload/abstract.py
diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/kv_offload/mediums.py
similarity index 93%
rename from vllm/v1/offloading/mediums.py
rename to vllm/v1/kv_offload/mediums.py
index 5a1887848c9fc..8962819178459 100644
--- a/vllm/v1/offloading/mediums.py
+++ b/vllm/v1/kv_offload/mediums.py
@@ -4,7 +4,7 @@ from abc import ABC
 
 import numpy as np
 
-from vllm.v1.offloading.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
 
 
 class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py
similarity index 98%
rename from vllm/v1/offloading/worker/worker.py
rename to vllm/v1/kv_offload/worker/worker.py
index d2c2045d1f1f6..b7a52a088fb90 100644
--- a/vllm/v1/offloading/worker/worker.py
+++ b/vllm/v1/kv_offload/worker/worker.py
@@ -3,7 +3,7 @@
 from abc import ABC, abstractmethod
 
 from vllm.logger import init_logger
-from vllm.v1.offloading.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
 
 # a single transfer spec (src_blocks_spec, dst_blocks_spec)
 TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]

From 9fac6aa30b669de75d8718164cd99676d3530e7d Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Thu, 18 Sep 2025 17:26:28 -0400
Subject: [PATCH 54/58] [BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv
 (#25206)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py
index a636a714145cf..4d1829cd228cd 100644
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@@ -36,7 +36,7 @@ def _extract_data_from_linear_base_module(
     assert m.quant_method.quant_config is not None
 
     w = m.weight
-    ws = m.weight_scale_inv
+    ws = m.weight_scale
     quant_block_size = m.quant_method.quant_config.weight_block_size
 
     assert isinstance(w, torch.Tensor)

From 9a4600e4dcbbd13988c31d5198d3ab8b4172ecca Mon Sep 17 00:00:00 2001
From: Andrew Sansom <andrew@protopia.ai>
Date: Thu, 18 Sep 2025 19:03:09 -0500
Subject: [PATCH 55/58] [CORE] Prompt Embeddings Support for v1 Engine (#24278)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Andrew Sansom <qthequartermasterman@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../test_basic_correctness.py                 | 10 --
 .../test_completion_with_prompt_embeds.py     |  1 -
 .../models/language/generation/test_common.py |  6 --
 vllm/engine/arg_utils.py                      | 24 +++--
 vllm/entrypoints/openai/protocol.py           |  2 +-
 vllm/utils/__init__.py                        | 27 +++++
 vllm/v1/core/sched/output.py                  | 24 +++--
 vllm/v1/engine/__init__.py                    |  3 +-
 vllm/v1/engine/detokenizer.py                 | 33 ++++---
 vllm/v1/engine/output_processor.py            | 25 ++++-
 vllm/v1/engine/processor.py                   | 38 +++++--
 vllm/v1/request.py                            | 15 ++-
 vllm/v1/sample/logits_processor/__init__.py   |  2 +-
 vllm/v1/sample/logits_processor/builtin.py    |  6 +-
 vllm/v1/sample/logits_processor/interface.py  |  2 +-
 vllm/v1/serial_utils.py                       |  2 +-
 vllm/v1/worker/gpu_input_batch.py             | 55 +++++++++--
 vllm/v1/worker/gpu_model_runner.py            | 99 ++++++++++++++++++-
 vllm/v1/worker/tpu_input_batch.py             |  6 +-
 vllm/v1/worker/tpu_model_runner.py            |  1 +
 20 files changed, 305 insertions(+), 76 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index fba18f197074b..24b1c9a93126c 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -76,11 +76,6 @@ def test_models(
     model_executor: str,
     enable_prompt_embeds: bool,
 ) -> None:
-
-    if enable_prompt_embeds and envs.is_set(
-            "VLLM_USE_V1") and envs.VLLM_USE_V1:
-        pytest.skip("enable_prompt_embeds is not supported in v1.")
-
     if not envs.VLLM_USE_V1:
         if async_scheduling:
             pytest.skip("async_scheduling only supported in v1.")
@@ -164,11 +159,6 @@ def test_models_distributed(
     extra_env: dict[str, str],
     enable_prompt_embeds: bool,
 ) -> None:
-
-    if enable_prompt_embeds and envs.is_set(
-            "VLLM_USE_V1") and envs.VLLM_USE_V1:
-        pytest.skip("enable_prompt_embeds is not supported in v1.")
-
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
index 3d56291bc793c..0e3fc82f0c033 100644
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@@ -36,7 +36,6 @@ def default_server_args() -> list[str]:
         "--enforce-eager",
         # Prompt Embeds server args
         "--enable-prompt-embeds",
-        "--no-enable-chunked-prefill",
     ]
 
 
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index a5aa1e3f49743..c14e71cbdb96d 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -125,12 +125,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
         # in parts of the operators
         pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
 
-    # Note: can be removed when
-    # https://github.com/vllm-project/vllm/pull/24278 finished
-    if current_platform.is_cpu() and use_prompt_embeds:
-        pytest.skip("Skipping use_prompt_embeds=True with "
-                    "V1-only CPU backend.")
-
     with hf_runner(model) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index fb5beab77b270..63282c4253509 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1513,12 +1513,6 @@ class EngineArgs:
                                    recommend_to_remove=False)
                 return False
 
-        # No text embedding inputs so far.
-        if self.enable_prompt_embeds:
-            _raise_or_fallback(feature_name="--enable-prompt-embeds",
-                               recommend_to_remove=False)
-            return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,
@@ -1651,6 +1645,13 @@ class EngineArgs:
                 "models in V0 and has been disabled.")
             self.enable_prefix_caching = False
 
+            if self.enable_prompt_embeds:
+                logger.warning(
+                    "--enable-prompt-embeds and --enable-prefix-caching "
+                    "are not supported together in V0. Prefix caching has "
+                    "been disabled.")
+                self.enable_prefix_caching = False
+
         # Set max_num_seqs to 256 for VLLM_V0.
         if self.max_num_seqs is None:
             self.max_num_seqs = 256
@@ -1664,6 +1665,17 @@ class EngineArgs:
         # For pooling tasks the default is False
         if model_config.runner_type != "pooling":
             self.enable_chunked_prefill = True
+
+            # TODO: When prefix caching supports prompt embeds inputs, this
+            # check can be removed.
+            if (self.enable_prompt_embeds
+                    and self.enable_prefix_caching is not False):
+                logger.warning(
+                    "--enable-prompt-embeds and --enable-prefix-caching "
+                    "are not supported together in V1. Prefix caching has "
+                    "been disabled.")
+                self.enable_prefix_caching = False
+
             if self.enable_prefix_caching is None:
                 self.enable_prefix_caching = True
         else:
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 7ad8e73d89d59..6b54511a66f33 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -973,7 +973,6 @@ class CompletionRequest(OpenAIBaseModel):
     # https://platform.openai.com/docs/api-reference/completions/create
     model: Optional[str] = None
     prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
-    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
     best_of: Optional[int] = None
     echo: Optional[bool] = False
     frequency_penalty: Optional[float] = 0.0
@@ -1009,6 +1008,7 @@ class CompletionRequest(OpenAIBaseModel):
     # --8<-- [end:completion-sampling-params]
 
     # --8<-- [start:completion-extra-params]
+    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
     add_special_tokens: bool = Field(
         default=True,
         description=(
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index f13381ecd9ff3..d4013a69e99fe 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -3443,3 +3443,30 @@ def decorate_logs(process_name: Optional[str] = None) -> None:
     pid = os.getpid()
     _add_prefix(sys.stdout, process_name, pid)
     _add_prefix(sys.stderr, process_name, pid)
+
+
+def length_from_prompt_token_ids_or_embeds(
+    prompt_token_ids: Optional[list[int]],
+    prompt_embeds: Optional[torch.Tensor],
+) -> int:
+    """Calculate the request length (in number of tokens) given either
+    prompt_token_ids or prompt_embeds.
+    """
+    prompt_token_len = None if prompt_token_ids is None else len(
+        prompt_token_ids)
+    prompt_embeds_len = \
+        None if prompt_embeds is None else len(prompt_embeds)
+
+    if prompt_token_len is None:
+        if prompt_embeds_len is None:
+            raise ValueError(
+                "Neither prompt_token_ids nor prompt_embeds were defined.")
+        return prompt_embeds_len
+    else:
+        if (prompt_embeds_len is not None
+                and prompt_embeds_len != prompt_token_len):
+            raise ValueError(
+                "Prompt token ids and prompt embeds had different lengths"
+                f" prompt_token_ids={prompt_token_len}"
+                f" prompt_embeds={prompt_embeds_len}")
+        return prompt_token_len
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 3ec5b91bf2860..209fc2a4404f3 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -11,6 +11,7 @@ from vllm._bc_linter import bc_linter_include
 if TYPE_CHECKING:
     import numpy as np
     import numpy.typing as npt
+    import torch
 
     from vllm.distributed.kv_transfer.kv_connector.v1.base import (
         KVConnectorMetadata)
@@ -26,13 +27,14 @@ if TYPE_CHECKING:
 class NewRequestData:
 
     req_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: list[MultiModalFeatureSpec]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
     block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     lora_request: Optional[LoRARequest]
+    prompt_embeds: Optional[torch.Tensor] = None
 
     @classmethod
     def from_request(
@@ -49,9 +51,12 @@ class NewRequestData:
             block_ids=block_ids,
             num_computed_tokens=request.num_computed_tokens,
             lora_request=request.lora_request,
+            prompt_embeds=request.prompt_embeds,
         )
 
-    def __repr__(self):
+    def __repr__(self) -> str:
+        embeds = self.prompt_embeds  # bool(Tensor) raises if numel > 1
+        prompt_embeds_shape = None if embeds is None else embeds.shape
         return (f"NewRequestData("
                 f"req_id={self.req_id},"
                 f"prompt_token_ids={self.prompt_token_ids},"
@@ -59,19 +64,26 @@ class NewRequestData:
                 f"sampling_params={self.sampling_params},"
                 f"block_ids={self.block_ids},"
                 f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request}"
+                f"lora_request={self.lora_request},"
+                f"prompt_embeds_shape={prompt_embeds_shape}"
                 ")")
 
     # Version of __repr__ with the prompt data obfuscated
-    def anon_repr(self):
+    def anon_repr(self) -> str:
+        prompt_token_ids_len = len(
+            self.prompt_token_ids
+        ) if self.prompt_token_ids is not None else None
+        embeds = self.prompt_embeds  # bool(Tensor) raises if numel > 1
+        prompt_embeds_shape = None if embeds is None else embeds.shape
         return (f"NewRequestData("
                 f"req_id={self.req_id},"
-                f"prompt_token_ids_len={len(self.prompt_token_ids)},"
+                f"prompt_token_ids_len={prompt_token_ids_len},"
                 f"mm_features={self.mm_features},"
                 f"sampling_params={self.sampling_params},"
                 f"block_ids={self.block_ids},"
                 f"num_computed_tokens={self.num_computed_tokens},"
-                f"lora_request={self.lora_request}"
+                f"lora_request={self.lora_request},"
+                f"prompt_embeds_shape={prompt_embeds_shape}"
                 ")")
 
 
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index dec4abec519bd..345f5a464c2cc 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -47,7 +47,7 @@ class EngineCoreRequest(
         gc=False):  # type: ignore[call-arg]
 
     request_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: Optional[list[MultiModalFeatureSpec]]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
@@ -56,6 +56,7 @@ class EngineCoreRequest(
     lora_request: Optional[LoRARequest]
     cache_salt: Optional[str]
     data_parallel_rank: Optional[int]
+    prompt_embeds: Optional[torch.Tensor] = None
 
     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index cf4b06db843bd..8aa36d6a439c1 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -13,6 +13,7 @@ from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.logger import init_logger
 from vllm.transformers_utils.detokenizer_utils import (
     AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest
 
 logger = init_logger(__name__)
@@ -179,11 +180,12 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
         self.tokenizer: Tokenizer = tokenizer._tokenizer
 
         # Find a safe place to start.
-        prompt_suffix = request.prompt_token_ids
+        prompt_token_ids = request.prompt_token_ids or []
+        prompt_suffix = prompt_token_ids
         prompt_len = len(prompt_suffix)
         if prompt_len > 4:
             for i in range(4, min(prompt_len + 1, 24)):
-                suffix = request.prompt_token_ids[-i:]
+                suffix = prompt_token_ids[-i:]
                 if '�' not in self.tokenizer.decode(suffix):
                     prompt_suffix = suffix
                     break
@@ -260,16 +262,25 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer):
         params = request.sampling_params
         assert params is not None
 
-        # Metadata for incremental detokenization.
-        self.tokens, self.prefix_offset, self.read_offset = (
-            convert_prompt_ids_to_tokens(
-                tokenizer=tokenizer,
-                prompt_ids=request.prompt_token_ids,
-                skip_special_tokens=params.skip_special_tokens,
-            ))
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
 
-        self.token_ids.extend(request.prompt_token_ids)
-        self.prompt_len = len(request.prompt_token_ids)
+        # Metadata for incremental detokenization.
+        if request.prompt_token_ids is not None:
+            self.tokens, self.prefix_offset, self.read_offset = (
+                convert_prompt_ids_to_tokens(
+                    tokenizer=tokenizer,
+                    prompt_ids=request.prompt_token_ids,
+                    skip_special_tokens=params.skip_special_tokens,
+                ))
+        else:
+            # Prompt embedding requests cannot be detokenized, in general.
+            self.tokens = [""] * self.prompt_len
+            self.prefix_offset = 0
+            self.read_offset = 0
+
+        self.token_ids.extend(request.prompt_token_ids
+                              or [0] * self.prompt_len)
 
         self.skip_special_tokens = params.skip_special_tokens
         self.spaces_between_special_tokens = (
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 5dad63988daa4..c17dc3e204ecd 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -14,6 +14,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.tracing import (SpanAttributes, SpanKind, Tracer,
                           extract_trace_context)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason
 from vllm.v1.engine.detokenizer import IncrementalDetokenizer
 from vllm.v1.engine.logprobs import LogprobsProcessor
@@ -86,7 +87,8 @@ class RequestState:
         lora_name: Optional[str],
         output_kind: RequestOutputKind,
         prompt: Optional[str],
-        prompt_token_ids: list[int],
+        prompt_token_ids: Optional[list[int]],
+        prompt_embeds: Optional[torch.Tensor],
         logprobs_processor: Optional[LogprobsProcessor],
         detokenizer: Optional[IncrementalDetokenizer],
         max_tokens_param: Optional[int],
@@ -104,7 +106,9 @@ class RequestState:
         self.output_kind = output_kind
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
-        self.prompt_len = len(prompt_token_ids)
+        self.prompt_embeds = prompt_embeds
+        self.prompt_len = length_from_prompt_token_ids_or_embeds(
+            self.prompt_token_ids, self.prompt_embeds)
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
         self.max_tokens_param = max_tokens_param
@@ -165,6 +169,7 @@ class RequestState:
             output_kind=output_kind,
             prompt=prompt,
             prompt_token_ids=request.prompt_token_ids,
+            prompt_embeds=request.prompt_embeds,
             logprobs_processor=logprobs_processor,
             detokenizer=detokenizer,
             max_tokens_param=max_tokens_param,
@@ -223,6 +228,8 @@ class RequestState:
         first_output = outputs[0]
         if isinstance(first_output, PoolingOutput):
             assert len(outputs) == 1
+            # Prompt embeddings are currently not supported by pooling requests.
+            assert self.prompt_token_ids is not None
             return PoolingRequestOutput(
                 request_id=request_id,
                 outputs=first_output,
@@ -236,10 +243,15 @@ class RequestState:
         else:
             prompt_logprobs = self.logprobs_processor.prompt_logprobs
 
+        # If prompt embeds were used, put placeholder prompt token ids
+        prompt_token_ids = self.prompt_token_ids
+        if prompt_token_ids is None and self.prompt_embeds is not None:
+            prompt_token_ids = [0] * len(self.prompt_embeds)
+
         return RequestOutput(
             request_id=request_id,
             prompt=self.prompt,
-            prompt_token_ids=self.prompt_token_ids,
+            prompt_token_ids=prompt_token_ids,
             prompt_logprobs=prompt_logprobs,
             outputs=cast(list[CompletionOutput], outputs),
             finished=finished,
@@ -469,6 +481,8 @@ class OutputProcessor:
 
         arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9)
         trace_context = extract_trace_context(engine_core_output.trace_headers)
+        prompt_length = length_from_prompt_token_ids_or_embeds(
+            req_state.prompt_token_ids, req_state.prompt_embeds)
         with (self.tracer.start_as_current_span(
                 "llm_request",
                 kind=SpanKind.SERVER,
@@ -488,7 +502,7 @@ class OutputProcessor:
             span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
                                queued_time)
             span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
-                               len(req_state.prompt_token_ids))
+                               prompt_length)
             span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
                                metrics.num_generation_tokens)
             span.set_attribute(
@@ -544,7 +558,8 @@ class OutputProcessor:
         assert req_state.stats is not None
         iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
-            num_prompt_tokens=len(req_state.prompt_token_ids),
+            num_prompt_tokens=length_from_prompt_token_ids_or_embeds(
+                req_state.prompt_token_ids, req_state.prompt_embeds),
             max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats)
         self.lora_states.finish_request(req_state)
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 71f539583a1be..507e2cd3223fd 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -19,6 +19,7 @@ from vllm.multimodal.utils import argsort_mm_positions
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.structured_output.backend_guidance import (
     validate_guidance_grammar)
@@ -390,6 +391,16 @@ class Processor:
         self._validate_model_inputs(processed_inputs)
 
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
+        # Mypy does not always properly infer the types of some elements of
+        # discriminated unions of TypedDicts, because of how it handles
+        # inheritance of TypedDict. If we explicitly extract the items we want
+        # we can avoid type errors from using `dict.get` later in the method.
+        prompt_str: Optional[str] = None if decoder_inputs[
+            "type"] == "embeds" else decoder_inputs.get("prompt")
+        prompt_token_ids = decoder_inputs[
+            "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None
+        prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[
+            "type"] == "embeds" else None
 
         sampling_params = None
         pooling_params = None
@@ -398,9 +409,10 @@ class Processor:
             sampling_params = params.clone()
             # If unset max tokens, then generate up to the max_model_len.
             if sampling_params.max_tokens is None:
-                sampling_params.max_tokens = (
-                    self.model_config.max_model_len -
-                    len(decoder_inputs["prompt_token_ids"]))
+                seq_len = length_from_prompt_token_ids_or_embeds(
+                    prompt_token_ids, prompt_embeds)
+                sampling_params.max_tokens = \
+                    self.model_config.max_model_len - seq_len
             sampling_params.update_from_generation_config(
                 self.generation_config_fields, eos_token_id)
             if self.tokenizer is not None:
@@ -430,9 +442,10 @@ class Processor:
                         identifier=decoder_mm_hashes[modality][idx],
                         mm_position=decoder_mm_positions[modality][idx]))
 
-        return decoder_inputs.get("prompt"), EngineCoreRequest(
+        return prompt_str, EngineCoreRequest(
             request_id=request_id,
-            prompt_token_ids=decoder_inputs["prompt_token_ids"],
+            prompt_token_ids=prompt_token_ids,
+            prompt_embeds=prompt_embeds,
             mm_features=mm_features,
             sampling_params=sampling_params,
             pooling_params=pooling_params,
@@ -461,10 +474,17 @@ class Processor:
     ):
         model_config = self.model_config
 
-        prompt_ids = prompt_inputs["prompt_token_ids"]
+        prompt_ids = None if prompt_inputs[
+            "type"] == "embeds" else prompt_inputs["prompt_token_ids"]
+        prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[
+            "type"] == "embeds" else None
+        prompt_len = length_from_prompt_token_ids_or_embeds(
+            prompt_ids, prompt_embeds)
         if not prompt_ids:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 pass  # Mllama may have empty encoder inputs for text-only data
+            elif prompt_inputs["type"] == "embeds":
+                pass  # Prompt embeds should not have prompt_ids.
             else:
                 raise ValueError(f"The {prompt_type} prompt cannot be empty")
 
@@ -472,7 +492,7 @@ class Processor:
             tokenizer = None
         else:
             tokenizer = self.tokenizer
-            max_input_id = max(prompt_ids, default=0)
+            max_input_id = max(prompt_ids or [], default=0)
 
             # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while
             # self.model_config.get_vocab_size() is the model’s vocab size.
@@ -490,7 +510,7 @@ class Processor:
                     f"Token id {max_input_id} is out of vocabulary")
 
         max_prompt_len = self.model_config.max_model_len
-        if len(prompt_ids) > max_prompt_len:
+        if prompt_len > max_prompt_len:
             if prompt_type == "encoder" and model_config.is_multimodal_model:
                 mm_registry = self.input_preprocessor.mm_registry
                 mm_processor = mm_registry.create_processor(
@@ -514,7 +534,7 @@ class Processor:
                     "number of text tokens.")
 
             raise ValueError(
-                f"The {prompt_type} prompt (length {len(prompt_ids)}) is "
+                f"The {prompt_type} prompt (length {prompt_len}) is "
                 f"longer than the maximum model length of {max_prompt_len}. "
                 f"{suggestion}")
 
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 145af788d2372..ff10fa00c1cf6 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -7,9 +7,12 @@ from collections.abc import Mapping
 from functools import partial
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
+import torch
+
 from vllm.multimodal.inputs import MultiModalFeatureSpec
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+from vllm.utils import length_from_prompt_token_ids_or_embeds
 from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
                             EngineCoreRequest, FinishReason)
 from vllm.v1.structured_output.request import StructuredOutputRequest
@@ -25,12 +28,13 @@ class Request:
     def __init__(
         self,
         request_id: str,
-        prompt_token_ids: list[int],
+        prompt_token_ids: Optional[list[int]],
         sampling_params: Optional[SamplingParams],
         pooling_params: Optional[PoolingParams],
         eos_token_id: Optional[int],
         client_index: int = 0,
         arrival_time: Optional[float] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
         mm_features: Optional[list[MultiModalFeatureSpec]] = None,
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
@@ -79,9 +83,13 @@ class Request:
                 "sampling_params and pooling_params can't both be unset")
 
         self.prompt_token_ids = prompt_token_ids
-        self.num_prompt_tokens = len(self.prompt_token_ids)
+        self.prompt_embeds = prompt_embeds
+        self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            prompt_token_ids, prompt_embeds)
         self._output_token_ids: list[int] = []
-        self._all_token_ids: list[int] = self.prompt_token_ids.copy()
+        self._all_token_ids: list[int] = self.prompt_token_ids.copy(
+        ) if self.prompt_token_ids is not None else [0
+                                                     ] * self.num_prompt_tokens
         self.num_output_placeholders = 0  # Used in async scheduling.
         self.spec_token_ids: list[int] = []
         self.num_computed_tokens = 0
@@ -123,6 +131,7 @@ class Request:
             request_id=request.request_id,
             client_index=request.client_index,
             prompt_token_ids=request.prompt_token_ids,
+            prompt_embeds=request.prompt_embeds,
             mm_features=request.mm_features,
             sampling_params=request.sampling_params,
             pooling_params=request.pooling_params,
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index df944873bcaf3..10cad5b530716 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -243,7 +243,7 @@ class AdapterLogitsProcessor(LogitsProcessor):
     def _new_state(
         self,
         params: SamplingParams,
-        prompt_ids: list[int],
+        prompt_ids: Optional[list[int]],
         output_ids: list[int],
     ) -> Optional[partial[torch.Tensor]]:
         """Return state representation for new request
diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index 60f9c0bdb6313..fc655d993cb4c 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -187,7 +187,8 @@ class MinTokensLogitsProcessor(LogitsProcessor):
 
     @staticmethod
     def add_request(
-        params: SamplingParams, _: list[int], output_tok_ids: list[int]
+        params: SamplingParams, _: Optional[list[int]],
+        output_tok_ids: list[int]
     ) -> Optional[tuple[int, Sequence[int], set[int]]]:
         min_tokens = params.min_tokens
         if not min_tokens or len(output_tok_ids) >= min_tokens:
@@ -234,7 +235,8 @@ class MinTokensLogitsProcessor(LogitsProcessor):
 
 def process_dict_updates(
     req_entries: dict[int, T], batch_update: Optional[BatchUpdate],
-    new_state: Callable[[SamplingParams, list[int], list[int]], Optional[T]]
+    new_state: Callable[[SamplingParams, Optional[list[int]], list[int]],
+                        Optional[T]]
 ) -> bool:
     """Utility function to update dict state for sparse LogitsProcessors."""
 
diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py
index 04027359909a6..a84afc2f347a0 100644
--- a/vllm/v1/sample/logits_processor/interface.py
+++ b/vllm/v1/sample/logits_processor/interface.py
@@ -26,7 +26,7 @@ RemovedRequest = int
 
 # (index, params, prompt_tok_ids, output_tok_ids) tuples for new
 # requests added to the batch.
-AddedRequest = tuple[int, SamplingParams, list[int], list[int]]
+AddedRequest = tuple[int, SamplingParams, Optional[list[int]], list[int]]
 
 # (index 1, index 2, directionality) tuples representing
 # one-way moves or two-way swaps of requests in batch
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index c8375d6f15517..50c1470c67edc 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -174,7 +174,7 @@ class MsgpackEncoder:
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
-        arr = obj.flatten().contiguous().view(torch.uint8).numpy()
+        arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy()
         if obj.nbytes < self.size_threshold:
             # Smaller tensors are encoded inline, just like ndarrays.
             data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data)
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 6717622efb801..79a392337574f 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -13,7 +13,7 @@ from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.utils import swap_dict_values
+from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import (BatchUpdateBuilder,
@@ -29,7 +29,7 @@ from vllm.v1.worker.block_table import MultiGroupBlockTable
 class CachedRequestState:
 
     req_id: str
-    prompt_token_ids: list[int]
+    prompt_token_ids: Optional[list[int]]
     mm_features: list[MultiModalFeatureSpec]
     sampling_params: Optional[SamplingParams]
     pooling_params: Optional[PoolingParams]
@@ -43,9 +43,11 @@ class CachedRequestState:
     mrope_position_delta: Optional[int] = None
 
     lora_request: Optional[LoRARequest] = None
+    prompt_embeds: Optional[torch.Tensor] = None
 
     def __post_init__(self):
-        self.num_prompt_tokens = len(self.prompt_token_ids)
+        self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            self.prompt_token_ids, self.prompt_embeds)
 
     @property
     def num_tokens(self) -> int:
@@ -63,6 +65,10 @@ class CachedRequestState:
 
     def get_token_id(self, idx: int) -> int:
         if idx < self.num_prompt_tokens:
+            if self.prompt_token_ids is None:
+                raise ValueError(
+                    f"Tried to access token index {idx}, but that token was "
+                    "provided via prompt_embeds, and its ID is unknown.")
             return self.prompt_token_ids[idx]
         elif idx - self.num_prompt_tokens < len(self.output_token_ids):
             return self.output_token_ids[idx - self.num_prompt_tokens]
@@ -109,6 +115,14 @@ class InputBatch:
             pin_memory=False,
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
+        self.is_token_ids = torch.zeros((max_num_reqs, max_model_len),
+                                        device="cpu",
+                                        dtype=bool,
+                                        pin_memory=False)
+        # Store prompt embeddings per request to avoid OOM from large upfront
+        # allocation if max_model_len is big.
+        # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size)
+        self.req_prompt_embeds: dict[int, torch.Tensor] = {}
         self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32)
         self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
@@ -310,15 +324,23 @@ class InputBatch:
         self.req_id_to_index[req_id] = req_index
 
         # Copy the prompt token ids and output token ids.
-        num_prompt_tokens = len(request.prompt_token_ids)
+        num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
         self.num_prompt_tokens[req_index] = num_prompt_tokens
-        self.token_ids_cpu[
-            req_index, :num_prompt_tokens] = request.prompt_token_ids
         start_idx = num_prompt_tokens
         end_idx = start_idx + len(request.output_token_ids)
+        if request.prompt_token_ids is not None:
+            self.token_ids_cpu[
+                req_index, :num_prompt_tokens] = request.prompt_token_ids
+            self.is_token_ids[req_index, :num_prompt_tokens] = True
+        else:
+            self.is_token_ids[req_index, :num_prompt_tokens] = False
+        if request.prompt_embeds is not None:
+            self.req_prompt_embeds[req_index] = request.prompt_embeds
         self.token_ids_cpu[req_index,
                            start_idx:end_idx] = request.output_token_ids
-        # Number of token ids in token_ids_cpu.
+        self.is_token_ids[req_index, start_idx:end_idx] = True
+        # Number of tokens in the request (token_ids_cpu or prompt_embeds).
         # NOTE(woosuk): This may include spec decode tokens.
         self.num_tokens[req_index] = request.num_tokens
         # Number of tokens without spec decode tokens.
@@ -503,6 +525,20 @@ class InputBatch:
         self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
         self.token_ids_cpu[i2, ...] = tmp
 
+        self.is_token_ids[[i1, i2], ...] = self.is_token_ids[[i2, i1], ...]
+
+        # Swap prompt embeddings if they exist
+        embeds_i1 = self.req_prompt_embeds.get(i1)
+        embeds_i2 = self.req_prompt_embeds.get(i2)
+        if embeds_i1 is not None:
+            self.req_prompt_embeds[i2] = embeds_i1
+        else:
+            self.req_prompt_embeds.pop(i2, None)
+        if embeds_i2 is not None:
+            self.req_prompt_embeds[i1] = embeds_i2
+        else:
+            self.req_prompt_embeds.pop(i1, None)
+
         self.block_table.swap_row(i1, i2)
 
         self.request_lora_mapping[i1], self.request_lora_mapping[i2] = \
@@ -592,6 +628,11 @@ class InputBatch:
             num_tokens = self.num_tokens[last_req_index]
             self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
                 last_req_index, :num_tokens]
+            self.is_token_ids[empty_index, :num_tokens] = self.is_token_ids[
+                last_req_index, :num_tokens]
+            if last_req_index in self.req_prompt_embeds:
+                self.req_prompt_embeds[
+                    empty_index] = self.req_prompt_embeds.pop(last_req_index)
             self.num_tokens[empty_index] = num_tokens
             self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[
                 last_req_index]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 053e8f0537ed9..3ee2160a42ffe 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -56,7 +56,9 @@ from vllm.sequence import IntermediateTensors, PoolerOutput
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         GiB_bytes, check_use_alibi, get_dtype_size,
-                        is_pin_memory_available, round_up, supports_dynamo)
+                        is_pin_memory_available,
+                        length_from_prompt_token_ids_or_embeds, round_up,
+                        supports_dynamo)
 from vllm.v1.attention.backends.flash_attn import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
@@ -197,6 +199,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 cache_config.cache_dtype]
 
         self.is_pooling_model = (model_config.runner_type == 'pooling')
+        self.enable_prompt_embeds = model_config.enable_prompt_embeds
         self.is_multimodal_raw_input_only_model = (
             model_config.is_multimodal_raw_input_only_model)
 
@@ -342,6 +345,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                                                self.hidden_size,
                                                dtype=self.dtype,
                                                numpy=False)
+        self.is_token_ids = self._make_buffer(self.max_num_tokens,
+                                              dtype=torch.bool)
         self.discard_request_indices = self._make_buffer(self.max_num_reqs,
                                                          dtype=torch.int64)
         self.num_discarded_requests = 0
@@ -574,6 +579,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             req_state = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
+                prompt_embeds=new_req_data.prompt_embeds,
                 mm_features=new_req_data.mm_features,
                 sampling_params=sampling_params,
                 pooling_params=pooling_params,
@@ -819,6 +825,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if self.input_batch.prev_sampled_token_ids is None:
             # Normal scheduling case
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
+            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
             return
 
         # Async scheduling case, where some decode requests from the previous
@@ -844,6 +852,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # If not all requests are decodes from the last iteration,
             # We need to copy the input_ids_cpu to the GPU first.
             self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
+            self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
+            self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
         if num_commmon_tokens == 0:
             # No requests in common with the previous iteration
             # So input_ids_cpu will have all the input ids.
@@ -857,6 +867,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 self.input_batch.prev_sampled_token_ids[:num_commmon_tokens,
                                                         0],
                 non_blocking=True)
+            self.is_token_ids.gpu[:num_commmon_tokens] = True
             return
         # Upload the index tensors asynchronously
         # so the scatter can be non-blocking.
@@ -947,14 +958,60 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # where M is the max_model_len.
         token_indices = (positions_np +
                          req_indices * self.input_batch.token_ids_cpu.shape[1])
+        token_indices_tensor = torch.from_numpy(token_indices)
 
         # NOTE(woosuk): We use torch.index_select instead of np.take here
         # because torch.index_select is much faster than np.take for large
         # tensors.
         torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
                            0,
-                           torch.from_numpy(token_indices),
+                           token_indices_tensor,
                            out=self.input_ids.cpu[:total_num_scheduled_tokens])
+        is_token_ids = self.input_batch.is_token_ids.flatten()
+        torch.index_select(
+            is_token_ids,
+            0,
+            token_indices_tensor,
+            out=self.is_token_ids.cpu[:total_num_scheduled_tokens])
+
+        # Because we did not pre-allocate a massive prompt_embeds CPU tensor on
+        # the InputBatch, we need to fill in the prompt embeds into the expected
+        # spots in the GPUModelRunner's pre-allocated inputs_embeds tensor.
+        if self.input_batch.req_prompt_embeds:
+            output_idx = 0
+            for req_idx in range(num_reqs):
+                num_sched = num_scheduled_tokens[req_idx]
+
+                # Skip if this request doesn't have embeddings
+                if req_idx not in self.input_batch.req_prompt_embeds:
+                    output_idx += num_sched
+                    continue
+
+                # Skip if no tokens scheduled
+                if num_sched <= 0:
+                    output_idx += num_sched
+                    continue
+
+                req_embeds = self.input_batch.req_prompt_embeds[req_idx]
+                start_pos = self.input_batch.num_computed_tokens_cpu[req_idx]
+
+                # Skip if trying to read beyond available embeddings
+                if start_pos >= req_embeds.shape[0]:
+                    output_idx += num_sched
+                    continue
+
+                # Copy available embeddings
+                end_pos = start_pos + num_sched
+                actual_end = min(end_pos, req_embeds.shape[0])
+                actual_num_sched = actual_end - start_pos
+
+                if actual_num_sched > 0:
+                    self.inputs_embeds.cpu[output_idx:output_idx +
+                                           actual_num_sched].copy_(
+                                               req_embeds[start_pos:actual_end]
+                                           )
+
+                output_idx += num_sched
 
         self.input_batch.block_table.compute_slot_mapping(
             req_indices, positions_np)
@@ -1279,7 +1336,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 self.input_batch.num_computed_tokens_cpu[index]
             num_scheduled_tokens = \
                 scheduler_output.num_scheduled_tokens[req_id]
-            num_prompt_tokens = len(req.prompt_token_ids)
+            num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+                req.prompt_token_ids, req.prompt_embeds)
 
             if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
                 prompt_part_len = max(0,
@@ -1845,6 +1903,32 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 **self._init_model_kwargs(num_scheduled_tokens),
                 **self._extract_mm_kwargs(scheduler_output),
             }
+        elif (self.enable_prompt_embeds and get_pp_group().is_first_rank):
+            # Get the input embeddings for the tokens that are not input embeds,
+            # then put them into the appropriate positions.
+            # TODO(qthequartermasterman): Since even when prompt embeds are
+            # enabled, (a) not all requests will use prompt embeds, and (b)
+            # after the initial prompt is processed, the rest of the generated
+            # tokens will be token ids, it is not desirable to have the
+            # embedding layer outside of the CUDA graph all the time. The v0
+            # engine avoids this by "double compiling" the CUDA graph, once
+            # with input_ids and again with inputs_embeds, for all num_tokens.
+            # If a batch only has token ids, then including the embedding layer
+            # in the CUDA graph will be more performant (like in the else case
+            # below).
+            token_ids_idx = self.is_token_ids.gpu[:num_scheduled_tokens] \
+                .nonzero(as_tuple=False) \
+                .squeeze(1)
+            # Some token ids may need to become embeds
+            if token_ids_idx.numel() > 0:
+                token_ids = self.input_ids.gpu[token_ids_idx]
+                tokens_to_embeds = self.model.get_input_embeddings(
+                    input_ids=token_ids)
+                self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds
+
+            inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens]
+            model_kwargs = self._init_model_kwargs(num_input_tokens)
+            input_ids = None
         else:
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
@@ -2023,6 +2107,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
             self.input_batch.token_ids_cpu[req_idx,
                                            start_idx:end_idx] = sampled_ids
+            self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
             self.input_batch.num_tokens[req_idx] = end_idx
 
@@ -2570,6 +2655,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
             # Get metadata for this request.
             request = self.requests[req_id]
+            if request.prompt_token_ids is None:
+                # Prompt logprobs is incompatible with prompt embeddings
+                continue
+
             num_prompt_tokens = len(request.prompt_token_ids)
             prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
                 self.device, non_blocking=True)
@@ -2922,6 +3011,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     **model_kwargs,
                     **self._dummy_mm_kwargs(num_reqs),
                 }
+            elif self.enable_prompt_embeds:
+                input_ids = None
+                inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
+                model_kwargs = self._init_model_kwargs(num_tokens)
             else:
                 input_ids = self.input_ids.gpu[:num_tokens]
                 inputs_embeds = None
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index dfa54d0ad83b6..4cd0ac352de0f 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -9,7 +9,7 @@ import torch
 
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingType
-from vllm.utils import swap_dict_values
+from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.worker.block_table import MultiGroupBlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState
@@ -213,7 +213,9 @@ class InputBatch:
         self.req_id_to_index[req_id] = req_index
 
         # Copy the prompt token ids and output token ids.
-        num_prompt_tokens = len(request.prompt_token_ids)
+        num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
+            request.prompt_token_ids, request.prompt_embeds)
+        # TODO: copy prompt_embeds
         self.num_prompt_tokens[req_index] = num_prompt_tokens
         self.token_ids_cpu[
             req_index, :num_prompt_tokens] = request.prompt_token_ids
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 43f12912707f1..01a8e5c3f0dba 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -387,6 +387,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.requests[req_id] = CachedRequestState(
                 req_id=req_id,
                 prompt_token_ids=new_req_data.prompt_token_ids,
+                prompt_embeds=new_req_data.prompt_embeds,
                 mm_features=new_req_data.mm_features,
                 sampling_params=sampling_params,
                 pooling_params=None,

From 9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26 Mon Sep 17 00:00:00 2001
From: Or Ozeri <oro@il.ibm.com>
Date: Fri, 19 Sep 2025 03:20:51 +0300
Subject: [PATCH 56/58] [KV offload][2/N] Introduce LRU-based CPU offloading
 management (#20075)

Signed-off-by: Or Ozeri <oro@il.ibm.com>
---
 tests/v1/kv_offload/test_cpu.py    | 175 +++++++++++++++++++++++++++++
 vllm/v1/kv_offload/backend.py      |  96 ++++++++++++++++
 vllm/v1/kv_offload/backends/cpu.py |  61 ++++++++++
 vllm/v1/kv_offload/lru_manager.py  | 132 ++++++++++++++++++++++
 4 files changed, 464 insertions(+)
 create mode 100644 tests/v1/kv_offload/test_cpu.py
 create mode 100644 vllm/v1/kv_offload/backend.py
 create mode 100644 vllm/v1/kv_offload/backends/cpu.py
 create mode 100644 vllm/v1/kv_offload/lru_manager.py

diff --git a/tests/v1/kv_offload/test_cpu.py b/tests/v1/kv_offload/test_cpu.py
new file mode 100644
index 0000000000000..cdee7811d85b3
--- /dev/null
+++ b/tests/v1/kv_offload/test_cpu.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
+                                         PrepareStoreOutput)
+from vllm.v1.kv_offload.backends.cpu import CPUBackend
+from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
+
+
+@dataclass
+class ExpectedPrepareStoreOutput:
+    block_hashes_to_store: list[int]
+    store_block_ids: list[int]
+    block_hashes_evicted: list[int]
+
+
+def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
+    return [BlockHash(str(i).encode()) for i in int_hashes]
+
+
+def verify_store_output(
+        prepare_store_output: Optional[PrepareStoreOutput],
+        expected_prepare_store_output: ExpectedPrepareStoreOutput):
+    assert prepare_store_output is not None
+    assert (prepare_store_output.block_hashes_to_store == to_hashes(
+        expected_prepare_store_output.block_hashes_to_store))
+    assert (prepare_store_output.block_hashes_evicted == to_hashes(
+        expected_prepare_store_output.block_hashes_evicted))
+    store_spec = prepare_store_output.store_spec
+    assert isinstance(store_spec, CPULoadStoreSpec)
+    expected_array = np.array(expected_prepare_store_output.store_block_ids,
+                              dtype=np.int64)
+    assert np.array_equal(expected_array, store_spec.block_ids)
+
+
+def verify_load_output(prepare_load_output: LoadStoreSpec,
+                       expected_prepare_load_output: list[int]):
+    assert isinstance(prepare_load_output, CPULoadStoreSpec)
+    expected_array = np.array(expected_prepare_load_output, dtype=np.int64)
+    assert np.array_equal(expected_array, prepare_load_output.block_ids)
+
+
+def verify_events(events: Iterable[OffloadingEvent],
+                  block_size: int,
+                  expected_stores: tuple[set[int], ...] = (),
+                  expected_evictions: tuple[set[int], ...] = ()):
+    stores: list[set[BlockHash]] = []
+    evictions: list[set[BlockHash]] = []
+    for event in events:
+        assert event.medium == CPULoadStoreSpec.medium()
+        assert event.block_size == block_size
+        if event.removed:
+            evictions.append(set(event.block_hashes))
+        else:
+            stores.append(set(event.block_hashes))
+
+    def to_hash_sets(
+            int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]:
+        return tuple([set(to_hashes(list(int_set))) for int_set in int_sets])
+
+    assert tuple(evictions) == to_hash_sets(expected_evictions)
+    assert tuple(stores) == to_hash_sets(expected_stores)
+
+
+def test_cpu_manager():
+    """
+    Tests LRUOffloadingManager with a CPUBackend.
+    """
+    # initialize a CPU backend with a capacity of 4 blocks
+    block_size = 256
+    cpu_backend = CPUBackend(block_size=block_size, num_blocks=4)
+    cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True)
+
+    # prepare store [1, 2]
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[1, 2],
+            store_block_ids=[0, 1],
+            block_hashes_evicted=[],
+        ))
+
+    # lookup [1, 2] -> not ready
+    assert cpu_manager.lookup(to_hashes([1, 2])) == 0
+
+    # no events so far
+    assert list(cpu_manager.take_events()) == []
+
+    # complete store [1, 2]
+    cpu_manager.complete_store(to_hashes([1, 2]))
+    verify_events(cpu_manager.take_events(),
+                  block_size=block_size,
+                  expected_stores=({1, 2}, ))
+
+    # lookup [1, 2]
+    assert cpu_manager.lookup(to_hashes([1])) == 1
+    assert cpu_manager.lookup(to_hashes([1, 2])) == 2
+    assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2
+
+    # prepare store [2, 3, 4, 5] -> evicts [1]
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[3, 4, 5],
+            store_block_ids=[2, 3, 0],
+            block_hashes_evicted=[1],
+        ))
+
+    # verify eviction event
+    verify_events(cpu_manager.take_events(),
+                  block_size=block_size,
+                  expected_evictions=({1}, ))
+
+    # prepare store with no space
+    assert cpu_manager.prepare_store(to_hashes([1, 6])) is None
+
+    # complete store [2, 3, 4, 5]
+    cpu_manager.complete_store(to_hashes([2, 3, 4, 5]))
+
+    # prepare load [2, 3]
+    prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3]))
+    verify_load_output(prepare_load_output, [1, 2])
+
+    # prepare store with no space ([2, 3] is being loaded)
+    assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None
+
+    # complete load [2, 3]
+    cpu_manager.complete_load(to_hashes([2, 3]))
+
+    # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest)
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[6, 7, 8],
+            store_block_ids=[3, 2, 1],
+            block_hashes_evicted=[2, 3, 4],
+        ))
+
+    # complete store [6, 7, 8]
+    cpu_manager.complete_store(to_hashes([6, 7, 8]))
+
+    # touch [5, 6, 7] (move to end of LRU order)
+    cpu_manager.touch(to_hashes([5, 6, 7]))
+
+    # prepare store [9] -> evicts [8] (oldest following previous touch)
+    prepare_store_output = cpu_manager.prepare_store(to_hashes([9]))
+    verify_store_output(
+        prepare_store_output,
+        ExpectedPrepareStoreOutput(
+            block_hashes_to_store=[9],
+            store_block_ids=[1],
+            block_hashes_evicted=[8],
+        ))
+
+    # complete store [7, 9] with failure
+    cpu_manager.complete_store(to_hashes([7, 9]), success=False)
+
+    # assert [7] is still stored, but [9] is not
+    assert cpu_manager.lookup(to_hashes([7])) == 1
+    assert cpu_manager.lookup(to_hashes([9])) == 0
+
+    verify_events(cpu_manager.take_events(),
+                  block_size=block_size,
+                  expected_stores=({3, 4, 5}, {6, 7, 8}),
+                  expected_evictions=({2, 3, 4}, {8}))
diff --git a/vllm/v1/kv_offload/backend.py b/vllm/v1/kv_offload/backend.py
new file mode 100644
index 0000000000000..87a74200116bb
--- /dev/null
+++ b/vllm/v1/kv_offload/backend.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+
+
+class BlockStatus(ctypes.Structure):
+    """
+    Offloading status for a single block of KV data.
+    Holds the following information:
+
+    ref_cnt - the current number of transfers using this block as a source.
+        A value of -1 indicates the block is not yet ready to be read.
+    Subclasses add backend-specific fields describing how to actually
+    read/write the block (e.g. CPUBlockStatus adds block_id).
+    """
+    _fields_ = [("ref_cnt", ctypes.c_int32)]
+
+    def __init__(self):
+        super().__init__()
+        # initialize block as "not ready" (ref_cnt = -1)
+        self.ref_cnt = -1
+
+    @property
+    def is_ready(self) -> bool:
+        """
+        Returns whether the block is ready to be read.
+        """
+        return self.ref_cnt >= 0
+
+
+class Backend(ABC):
+    """
+    An abstract class for allocating and returning specs for writing
+    KV blocks to some backend.
+    """
+
+    def __init__(self, block_size: int, medium: str):
+        self.block_size = block_size
+        self.medium = medium
+
+    @abstractmethod
+    def get_num_free_blocks(self):
+        """
+        Returns the current number of blocks that can be allocated.
+        """
+        pass
+
+    @abstractmethod
+    def allocate_blocks(self,
+                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        """
+        Allocate space for writing blocks.
+        This method assumes there is enough space for allocation.
+        It is unsafe to use without checking get_num_free_blocks beforehand.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to be written.
+
+        Returns:
+            A list of BlockStatus for the allocated blocks.
+            The ref_cnt of each returned item will be -1, meaning the block
+            is not yet ready to be read.
+        """
+        pass
+
+    @abstractmethod
+    def free(self, block: BlockStatus):
+        """
+        Free a previously allocated block.
+        You should only call this function with blocks returned by
+        allocate_blocks, and only once per block.
+
+        Args:
+            block: The block to be freed.
+        """
+        pass
+
+    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
+                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+        """
+        Get backend-specific information on how to read/write blocks.
+
+        Args:
+            block_hashes: the list of block hashes identifying the blocks.
+            blocks: the list of blocks.
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker
+            to read/write the blocks.
+        """
+        raise NotImplementedError
diff --git a/vllm/v1/kv_offload/backends/cpu.py b/vllm/v1/kv_offload/backends/cpu.py
new file mode 100644
index 0000000000000..eb1123d1d83ac
--- /dev/null
+++ b/vllm/v1/kv_offload/backends/cpu.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import ctypes
+from collections.abc import Iterable
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec
+
+
+class CPUBlockStatus(BlockStatus):
+    _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64)
+                                       ]  # type: ignore
+
+    def __init__(self, block_id: int):
+        super().__init__()
+        self.block_id = block_id
+
+
+class CPUBackend(Backend):
+
+    def __init__(self, block_size: int, num_blocks: int):
+        super().__init__(block_size=block_size,
+                         medium=CPULoadStoreSpec.medium())
+
+        self.num_blocks: int = num_blocks
+        self.num_allocated_blocks: int = 0
+        self.allocated_blocks_free_list: list[int] = []
+
+    def get_num_free_blocks(self):
+        return (len(self.allocated_blocks_free_list) + self.num_blocks -
+                self.num_allocated_blocks)
+
+    def allocate_blocks(self,
+                        block_hashes: list[BlockHash]) -> list[BlockStatus]:
+        num_fresh_blocks = min(len(block_hashes),
+                               self.num_blocks - self.num_allocated_blocks)
+        num_reused_blocks = len(block_hashes) - num_fresh_blocks
+        assert len(self.allocated_blocks_free_list) >= num_reused_blocks
+
+        # allocate fresh blocks
+        blocks: list[BlockStatus] = []
+        for _ in range(num_fresh_blocks):
+            blocks.append(CPUBlockStatus(self.num_allocated_blocks))
+            self.num_allocated_blocks += 1
+
+        # allocate reused blocks
+        for _ in range(num_reused_blocks):
+            block_id = self.allocated_blocks_free_list.pop()
+            blocks.append(CPUBlockStatus(block_id))
+
+        return blocks
+
+    def free(self, block: BlockStatus):
+        assert isinstance(block, CPUBlockStatus)
+        self.allocated_blocks_free_list.append(block.block_id)
+
+    def get_load_store_spec(self, block_hashes: Iterable[BlockHash],
+                            blocks: Iterable[BlockStatus]) -> LoadStoreSpec:
+        return CPULoadStoreSpec([block.block_id for block in blocks])
diff --git a/vllm/v1/kv_offload/lru_manager.py b/vllm/v1/kv_offload/lru_manager.py
new file mode 100644
index 0000000000000..18d3b1d637b32
--- /dev/null
+++ b/vllm/v1/kv_offload/lru_manager.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections import OrderedDict
+from collections.abc import Iterable
+from typing import Optional
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent,
+                                         OffloadingManager, PrepareStoreOutput)
+from vllm.v1.kv_offload.backend import Backend, BlockStatus
+
+
+class LRUOffloadingManager(OffloadingManager):
+    """
+    An OffloadingManager with a pluggable backend, which evicts blocks by LRU.
+    """
+
+    def __init__(self, backend: Backend, enable_events: bool = False):
+        self.backend: Backend = backend
+        # block_hash -> BlockStatus
+        self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
+        self.events: Optional[list[OffloadingEvent]] = \
+            [] if enable_events else None
+
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        hit_count = 0
+        for block_hash in block_hashes:
+            block = self.blocks.get(block_hash)
+            if block is None or not block.is_ready:
+                break
+            hit_count += 1
+        return hit_count
+
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        blocks = []
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.is_ready
+            block.ref_cnt += 1
+            blocks.append(block)
+
+        return self.backend.get_load_store_spec(block_hashes, blocks)
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in reversed(list(block_hashes)):
+            if self.blocks.get(block_hash):
+                self.blocks.move_to_end(block_hash)
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        for block_hash in block_hashes:
+            block = self.blocks[block_hash]
+            assert block.ref_cnt > 0
+            block.ref_cnt -= 1
+
+    def prepare_store(
+            self,
+            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        # filter out blocks that are already stored
+        block_hashes_to_store = [
+            block_hash for block_hash in block_hashes
+            if block_hash not in self.blocks
+        ]
+
+        num_blocks_to_evict = (len(block_hashes_to_store) -
+                               self.backend.get_num_free_blocks())
+
+        # build list of blocks to evict
+        to_evict = []
+        if num_blocks_to_evict > 0:
+            for block_hash, block in self.blocks.items():
+                if block.ref_cnt == 0:
+                    to_evict.append(block_hash)
+                    num_blocks_to_evict -= 1
+                    if num_blocks_to_evict == 0:
+                        break
+            else:
+                # we could not evict enough blocks
+                return None
+
+        # evict blocks
+        for block_hash in to_evict:
+            self.backend.free(self.blocks.pop(block_hash))
+
+        if to_evict and self.events is not None:
+            self.events.append(
+                OffloadingEvent(block_hashes=to_evict,
+                                block_size=self.backend.block_size,
+                                medium=self.backend.medium,
+                                removed=True))
+
+        blocks = self.backend.allocate_blocks(block_hashes_to_store)
+        assert len(blocks) == len(block_hashes_to_store)
+
+        for block_hash, block in zip(block_hashes_to_store, blocks):
+            self.blocks[block_hash] = block
+
+        # build store specs for allocated blocks
+        store_spec = self.backend.get_load_store_spec(block_hashes_to_store,
+                                                      blocks)
+
+        return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store,
+                                  store_spec=store_spec,
+                                  block_hashes_evicted=to_evict)
+
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        stored_block_hashes: list[BlockHash] = []
+        if success:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    block.ref_cnt = 0
+                    stored_block_hashes.append(block_hash)
+        else:
+            for block_hash in block_hashes:
+                block = self.blocks[block_hash]
+                if not block.is_ready:
+                    self.backend.free(block)
+                    del self.blocks[block_hash]
+
+        if stored_block_hashes and self.events is not None:
+            self.events.append(
+                OffloadingEvent(block_hashes=stored_block_hashes,
+                                block_size=self.backend.block_size,
+                                medium=self.backend.medium,
+                                removed=False))
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        if self.events is not None:
+            yield from self.events
+            self.events.clear()

From 6d8246aaffff3ebec84767e373212a7b8da328e2 Mon Sep 17 00:00:00 2001
From: Andrew Xia <axia@meta.com>
Date: Thu, 18 Sep 2025 19:11:59 -0700
Subject: [PATCH 57/58] [gpt-oss] Add ResponseReasoningPartAddedEvent,
 ResponseReasoningPartDoneEvent for streaming (#24938)

Signed-off-by: Andrew Xia <axia@meta.com>
---
 .../openai/test_response_api_with_harmony.py  | 56 +++++++++++-
 vllm/entrypoints/openai/protocol.py           | 88 ++++++++++++++-----
 vllm/entrypoints/openai/serving_responses.py  | 32 ++++---
 3 files changed, 143 insertions(+), 33 deletions(-)

diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index eceaff672112f..8d974d56b4450 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
     assert response3.status == "completed"
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_streaming_types(client: OpenAI, model_name: str):
+    prompts = [
+        "tell me a story about a cat in 20 words",
+    ]
+
+    # this links the "done" type with the "start" type
+    # so every "done" type should have a corresponding "start" type
+    # and every open block should be closed by the end of the stream
+    pairs_of_event_types = {
+        "response.completed": "response.created",
+        "response.output_item.done": "response.output_item.added",
+        "response.content_part.done": "response.content_part.added",
+        "response.output_text.done": "response.output_text.delta",
+        "response.web_search_call.done": "response.web_search_call.added",
+        "response.reasoning_text.done": "response.reasoning_text.delta",
+        "response.reasoning_part.done": "response.reasoning_part.added",
+    }
+
+    for prompt in prompts:
+        response = await client.responses.create(
+            model=model_name,
+            input=prompt,
+            reasoning={"effort": "low"},
+            tools=[],
+            stream=True,
+            background=False,
+        )
+
+        stack_of_event_types = []
+        async for event in response:
+            if event.type == 'response.created':
+                stack_of_event_types.append(event.type)
+            elif event.type == 'response.completed':
+                assert stack_of_event_types[-1] == pairs_of_event_types[
+                    event.type]
+                stack_of_event_types.pop()
+            if event.type.endswith("added"):
+                stack_of_event_types.append(event.type)
+            elif event.type.endswith("delta"):
+                if stack_of_event_types[-1] == event.type:
+                    continue
+                stack_of_event_types.append(event.type)
+            elif event.type.endswith("done"):
+                assert stack_of_event_types[-1] == pairs_of_event_types[
+                    event.type]
+                stack_of_event_types.pop()
+        assert len(stack_of_event_types) == 0
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("background", [True, False])
@@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                 assert event.item_id == current_item_id
 
             # verify content_index_id is correct
-            if event.type == "response.content_part.added":
+            if event.type in [
+                    "response.content_part.added",
+                    "response.reasoning_part.added"
+            ]:
                 assert event.content_index != current_content_index
                 current_content_index = event.content_index
             elif event.type in [
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6b54511a66f33..05d5d6d964dd3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -31,6 +31,8 @@ from openai.types.responses import (
     ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent,
     ResponseStatus, ResponseWebSearchCallCompletedEvent,
     ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent)
 
 # Backward compatibility for OpenAI client versions
 try:  # For older openai versions (< 1.100.0)
@@ -260,26 +262,6 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
                                            ResponseReasoningItem,
                                            ResponseFunctionToolCall]
 
-StreamingResponsesResponse: TypeAlias = Union[
-    ResponseCreatedEvent,
-    ResponseInProgressEvent,
-    ResponseCompletedEvent,
-    ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent,
-    ResponseContentPartAddedEvent,
-    ResponseContentPartDoneEvent,
-    ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent,
-    ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallCodeDeltaEvent,
-    ResponseWebSearchCallInProgressEvent,
-    ResponseWebSearchCallSearchingEvent,
-    ResponseWebSearchCallCompletedEvent,
-    ResponseCodeInterpreterCallCodeDoneEvent,
-    ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterCallCompletedEvent,
-]
-
 
 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
@@ -1916,6 +1898,72 @@ class ResponsesResponse(OpenAIBaseModel):
         )
 
 
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartDoneEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that is done."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that is done."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.done"]
+    """The type of the event. Always `response.reasoning_part.done`."""
+
+
+# TODO: this code can be removed once
+# https://github.com/openai/openai-python/issues/2634 has been resolved
+class ResponseReasoningPartAddedEvent(OpenAIBaseModel):
+    content_index: int
+    """The index of the content part that was added."""
+
+    item_id: str
+    """The ID of the output item that the content part was added to."""
+
+    output_index: int
+    """The index of the output item that the content part was added to."""
+
+    part: ResponseReasoningTextContent
+    """The content part that was added."""
+
+    sequence_number: int
+    """The sequence number of this event."""
+
+    type: Literal["response.reasoning_part.added"]
+    """The type of the event. Always `response.reasoning_part.added`."""
+
+
+StreamingResponsesResponse: TypeAlias = Union[
+    ResponseCreatedEvent,
+    ResponseInProgressEvent,
+    ResponseCompletedEvent,
+    ResponseOutputItemAddedEvent,
+    ResponseOutputItemDoneEvent,
+    ResponseContentPartAddedEvent,
+    ResponseContentPartDoneEvent,
+    ResponseReasoningTextDeltaEvent,
+    ResponseReasoningTextDoneEvent,
+    ResponseReasoningPartAddedEvent,
+    ResponseReasoningPartDoneEvent,
+    ResponseCodeInterpreterCallInProgressEvent,
+    ResponseCodeInterpreterCallCodeDeltaEvent,
+    ResponseWebSearchCallInProgressEvent,
+    ResponseWebSearchCallSearchingEvent,
+    ResponseWebSearchCallCompletedEvent,
+    ResponseCodeInterpreterCallCodeDoneEvent,
+    ResponseCodeInterpreterCallInterpretingEvent,
+    ResponseCodeInterpreterCallCompletedEvent,
+]
+
 BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest,
                               ScoreRequest, RerankRequest]
 
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 469d74272b0e6..4894623aeac28 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -58,6 +58,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse,
                                               InputTokensDetails,
                                               OutputTokensDetails,
                                               RequestResponseMetadata,
+                                              ResponseReasoningPartAddedEvent,
+                                              ResponseReasoningPartDoneEvent,
                                               ResponsesRequest,
                                               ResponsesResponse, ResponseUsage,
                                               StreamingResponsesResponse)
@@ -1280,14 +1282,13 @@ class OpenAIServingResponses(OpenAIServing):
                         # Deal with tool call here
                         pass
                     elif previous_item.channel == "analysis":
+                        content = ResponseReasoningTextContent(
+                            text=previous_item.content[0].text,
+                            type="reasoning_text",
+                        )
                         reasoning_item = ResponseReasoningItem(
                             type="reasoning",
-                            content=[
-                                ResponseReasoningTextContent(
-                                    text=previous_item.content[0].text,
-                                    type="reasoning_text",
-                                ),
-                            ],
+                            content=[content],
                             status="completed",
                             id=current_item_id,
                             summary=[],
@@ -1301,6 +1302,15 @@ class OpenAIServingResponses(OpenAIServing):
                                 content_index=current_content_index,
                                 text=previous_item.content[0].text,
                             ))
+                        yield _increment_sequence_number_and_return(
+                            ResponseReasoningPartDoneEvent(
+                                type="response.reasoning_part.done",
+                                sequence_number=-1,
+                                item_id=current_item_id,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                part=content,
+                            ))
                         yield _increment_sequence_number_and_return(
                             ResponseOutputItemDoneEvent(
                                 type="response.output_item.done",
@@ -1412,17 +1422,15 @@ class OpenAIServingResponses(OpenAIServing):
                             ))
                         current_content_index += 1
                         yield _increment_sequence_number_and_return(
-                            ResponseContentPartAddedEvent(
-                                type="response.content_part.added",
+                            ResponseReasoningPartAddedEvent(
+                                type="response.reasoning_part.added",
                                 sequence_number=-1,
                                 output_index=current_output_index,
                                 item_id=current_item_id,
                                 content_index=current_content_index,
-                                part=ResponseOutputText(
-                                    type="output_text",
+                                part=ResponseReasoningTextContent(
                                     text="",
-                                    annotations=[],
-                                    logprobs=[],
+                                    type="reasoning_text",
                                 ),
                             ))
                     yield _increment_sequence_number_and_return(

From 1a0a04dae94b7a768c0d59b4f687bcf5e12d3127 Mon Sep 17 00:00:00 2001
From: Chen Ding <dingchen.mail@gmail.com>
Date: Fri, 19 Sep 2025 11:31:16 +0800
Subject: [PATCH 58/58] [Perf] Optimize memory peak during EAGLE model loading.
 (#24585)

Signed-off-by: Chen Ding <candy.dc@alibaba-inc.com>
---
 vllm/model_executor/models/deepseek_eagle.py | 15 ++++++-------
 vllm/model_executor/models/llama4_eagle.py   | 22 +++++++++-----------
 vllm/model_executor/models/llama_eagle.py    | 15 ++++++-------
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index b1d7f24c2f18b..2770ddebc48ab 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -229,14 +229,15 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+
+        def transform(inputs):
+            name, loaded_weight = inputs
+            if "lm_head" not in name:
+                name = "model." + name
+            return name, loaded_weight
+
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
         )
-
-        model_weights = {}
-        for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
-        loader.load_weights(model_weights.items())
+        loader.load_weights(map(transform, weights))
diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index ece490ff2f2a8..a203af53205cd 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -205,23 +205,21 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> None:
+
+        def transform(inputs):
+            name, loaded_weight = inputs
+            name, weight = self.permute_qk_weight_for_rotary(
+                name, loaded_weight)
+            if "lm_head" not in name:
+                name = "model." + name
+            return name, weight
+
         loader = AutoWeightsLoader(
             self,
             # lm_head is tied with target model (Llama4ForCausalLM)
             skip_prefixes=(["lm_head."]),
         )
-
-        model_weights = {}
-        weights = [
-            self.permute_qk_weight_for_rotary(name, loaded_weight)
-            for name, loaded_weight in weights
-        ]
-        for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
-
-        loader.load_weights(model_weights.items())
+        loader.load_weights(map(transform, weights))
 
     def get_input_embeddings(
         self,
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index a4933b77e3a53..dfae3c3ea5437 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -158,14 +158,15 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
         return self.model(input_ids, positions, hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+
+        def transform(inputs):
+            name, loaded_weight = inputs
+            if "lm_head" not in name:
+                name = "model." + name
+            return name, loaded_weight
+
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
         )
-
-        model_weights = {}
-        for name, loaded_weight in weights:
-            if "lm_head" not in name:
-                name = "model." + name
-            model_weights[name] = loaded_weight
-        loader.load_weights(model_weights.items())
+        loader.load_weights(map(transform, weights))