From 3127274d022b0bc8ff6ba9ceef41a99a6f01ad2d Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 17 Sep 2025 21:04:21 -0700 Subject: [PATCH 01/58] [MM Encoder] Apply DP ViT for Qwen3-VL model series (#24955) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Roger Wang Signed-off-by: Isotr0py Co-authored-by: Huang Jie <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: 松灵 <26085463+wulipc@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Isotr0py --- vllm/model_executor/models/qwen3_vl.py | 94 +++++++++++++++++----- vllm/model_executor/models/qwen3_vl_moe.py | 2 + 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 22948aee4936c..2c36dfbce7f67 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -126,20 +126,23 @@ class Qwen3_VisionMLP(nn.Module): bias: bool = False, act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.linear_fc1 = ColumnParallelLinear(in_features, hidden_features, bias=bias, quant_config=quant_config, return_bias=False, - prefix=f"{prefix}.linear_fc1") + prefix=f"{prefix}.linear_fc1", + disable_tp=use_data_parallel) self.linear_fc2 = RowParallelLinear(hidden_features, in_features, bias=bias, quant_config=quant_config, return_bias=False, - prefix=f"{prefix}.linear_fc2") + prefix=f"{prefix}.linear_fc2", + disable_tp=use_data_parallel) self.act_fn = act_fn def forward(self, x: torch.Tensor): @@ -158,23 +161,27 @@ class Qwen3_VisionBlock(nn.Module): norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = 
False, ) -> None: super().__init__() if norm_layer is None: norm_layer = partial(nn.LayerNorm, eps=1e-6) self.norm1 = norm_layer(dim) self.norm2 = norm_layer(dim) - self.attn = Qwen2_5_VisionAttention(embed_dim=dim, - num_heads=num_heads, - projection_size=dim, - quant_config=quant_config, - prefix=f"{prefix}.attn") + self.attn = Qwen2_5_VisionAttention( + embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel) self.mlp = Qwen3_VisionMLP(dim, mlp_hidden_dim, act_fn=act_fn, bias=True, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) def forward( self, @@ -205,6 +212,7 @@ class Qwen3_VisionPatchMerger(nn.Module): use_postshuffle_norm: bool = False, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = context_dim * (spatial_merge_size**2) @@ -222,13 +230,15 @@ class Qwen3_VisionPatchMerger(nn.Module): self.hidden_size, bias=True, quant_config=quant_config, - prefix=f"{prefix}.linear_fc1") + prefix=f"{prefix}.linear_fc1", + disable_tp=use_data_parallel) self.act_fn = nn.GELU() self.linear_fc2 = RowParallelLinear(self.hidden_size, d_model, bias=True, quant_config=quant_config, - prefix=f"{prefix}.linear_fc2") + prefix=f"{prefix}.linear_fc2", + disable_tp=use_data_parallel) def forward(self, x: torch.Tensor) -> torch.Tensor: if self.use_postshuffle_norm: @@ -250,6 +260,7 @@ class Qwen3_VisionTransformer(nn.Module): norm_eps: float = 1e-6, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = vision_config.hidden_size @@ -260,6 +271,12 @@ class Qwen3_VisionTransformer(nn.Module): self.spatial_merge_unit = self.spatial_merge_size**2 self.temporal_patch_size = vision_config.temporal_patch_size 
self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes + self.use_data_parallel = use_data_parallel + + # NOTE: This is used for creating empty tensor for all_gather for + # DP ViT. Here out_hidden_size is enlarged due to deepstack + self.out_hidden_size = (vision_config.out_hidden_size * + (1 + len(self.deepstack_visual_indexes))) self.patch_embed = Qwen3_VisionPatchEmbed( patch_size=self.patch_size, @@ -283,7 +300,8 @@ class Qwen3_VisionTransformer(nn.Module): act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], norm_layer=norm_layer, quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(vision_config.depth) ]) @@ -294,6 +312,7 @@ class Qwen3_VisionTransformer(nn.Module): spatial_merge_size=self.spatial_merge_size, quant_config=quant_config, prefix=f"{prefix}.merger", + use_data_parallel=use_data_parallel, ) self.deepstack_merger_list = nn.ModuleList([ @@ -304,7 +323,8 @@ class Qwen3_VisionTransformer(nn.Module): use_postshuffle_norm=True, norm_layer=norm_layer, quant_config=quant_config, - prefix=f"{prefix}.deepstack_merger_list.{layer_idx}") + prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(len(self.deepstack_visual_indexes)) ]) @@ -325,7 +345,14 @@ class Qwen3_VisionTransformer(nn.Module): def rot_pos_emb(self, grid_thw): pos_ids = [] - for t, h, w in grid_thw: + # Support both Tensor and list inputs for DP path + if isinstance(grid_thw, list): + grid_list = grid_thw + max_grid_size = max(max(h, w) for _, h, w in grid_list) + else: + grid_list = grid_thw.tolist() + max_grid_size = int(grid_thw[:, 1:].max().item()) + for t, h, w in grid_list: hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) hpos_ids = hpos_ids.reshape( h // self.spatial_merge_size, @@ -348,7 +375,6 @@ class Qwen3_VisionTransformer(nn.Module): pos_ids.append( torch.stack([hpos_ids, 
wpos_ids], dim=-1).repeat(t, 1)) pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb @@ -453,10 +479,18 @@ class Qwen3_VisionTransformer(nn.Module): hidden_states = hidden_states + pos_embeds rotary_pos_emb = self.rot_pos_emb(grid_thw) + if isinstance(grid_thw, list): + grid_thw_tensor = torch.tensor(grid_thw, + device=hidden_states.device, + dtype=torch.int32) + else: + grid_thw_tensor = grid_thw + cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + grid_thw_tensor[:, 1] * grid_thw_tensor[:, 2], + grid_thw_tensor[:, 0]).cumsum( dim=0, - dtype=grid_thw.dtype + dtype=grid_thw_tensor.dtype if torch.jit.is_tracing() else torch.int32, ) cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) @@ -984,6 +1018,9 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, "up_proj", ], } + + supports_encoder_tp_data = True + # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -1009,12 +1046,14 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.visual = Qwen3_VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) self.language_model = Qwen3LLMForCausalLM(vllm_config=vllm_config, @@ -1177,7 +1216,15 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + if self.use_data_parallel: + from vllm.multimodal.utils import ( + run_dp_sharded_mrope_vision_model) + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) # Split concatenated embeddings for each image item. # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync @@ -1199,7 +1246,16 @@ class Qwen3VLForConditionalGeneration(nn.Module, SupportsMultiModal, else: pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + if self.use_data_parallel: + from vllm.multimodal.utils import ( + run_dp_sharded_mrope_vision_model) + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d") + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw) # Split concatenated embeddings for each video item. 
# Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index a800e94ab1e50..d25bc71dcb59b 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -315,12 +315,14 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.visual = Qwen3_VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) self.language_model = Qwen3MoeLLMForCausalLM(vllm_config=vllm_config, From 32baf1d03685ead1f5946f867e4ca16007bd10b5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 05:05:18 +0100 Subject: [PATCH 02/58] [Docs] Clean up the contributing README (#25099) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/contributing/README.md | 177 +++++++++--------- .../installation/python_env_setup.inc.md | 2 +- mkdocs.yaml | 1 + 3 files changed, 95 insertions(+), 85 deletions(-) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 5a2a70d57e85f..b0a95b3b3d3a5 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -26,113 +26,123 @@ See . ## Developing ---8<-- "docs/getting_started/installation/python_env_setup.inc.md" - -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. -Check out the [building from source][build-from-source] documentation for details. 
- -For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. - -### Building the docs with MkDocs - -#### Introduction to MkDocs - -[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file. - -#### Install MkDocs and Plugins - -Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies: - -```bash -uv pip install -r requirements/docs.txt -``` - -!!! note - Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+) - -#### Verify Installation - -Confirm that MkDocs is correctly installed: - -```bash -mkdocs --version -``` - -Example output: - -```console -mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10) -``` - -#### Clone the `vLLM` repository +The first step of contributing to vLLM is to clone the GitHub repository: ```bash git clone https://github.com/vllm-project/vllm.git cd vllm ``` -#### Start the Development Server +Then, configure your Python virtual environment. -MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command: +--8<-- "docs/getting_started/installation/python_env_setup.inc.md" + +If you are only developing vLLM's Python code, install vLLM using: ```bash -mkdocs serve +VLLM_USE_PRECOMPILED=1 uv pip install -e . 
``` -Example output: +If you are developing vLLM's Python and CUDA/C++ code, install vLLM using: -```console -INFO - Documentation built in 106.83 seconds -INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml' -INFO - [22:02:02] Serving on http://127.0.0.1:8000/ +```bash +uv pip install -e . ``` -#### View in Your Browser +For more details about installing from source and installing for other hardware, check out the [installation instructions](../getting_started/installation/README.md) for your hardware and head to the "Build wheel from source" section. -Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:. - -#### Learn More - -For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/). - -## Testing - -??? console "Commands" - - ```bash - # These commands are only for Nvidia CUDA platforms. - uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto - - # Linting, formatting and static type checking - pre-commit install - - # You can manually run pre-commit with - pre-commit run --all-files --show-diff-on-failure - - # To manually run something from CI that does not run - # locally by default, you can run: - pre-commit run mypy-3.9 --hook-stage manual --all-files - - # Unit tests - pytest tests/ - - # Run tests for a single test file with detailed output - pytest -s -v tests/test_logger.py - ``` +For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. !!! tip - Since the ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12. + vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12. 
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. -!!! note "Install python3-dev if Python.h is missing" +### Linting + +vLLM uses `pre-commit` to lint and format the codebase. See if `pre-commit` is new to you. Setting up `pre-commit` is as easy as: + +```bash +uv pip install pre-commit +pre-commit install +``` + +vLLM's `pre-commit` hooks will now run automatically every time you commit. + +!!! tip "Tips" + You can manually run the `pre-commit` hooks using: + + ```bash + pre-commit run # runs on staged files + pre-commit run -a # runs on all files (short for --all-files) + ``` + + --- + + Some `pre-commit` hooks only run in CI. If you need to, you can run them locally with: + + ```bash + pre-commit run --hook-stage manual markdownlint + pre-commit run --hook-stage manual mypy-3.9 + ``` + +### Documentation + +MkDocs is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file, . + +Get started with: + +```bash +uv pip install -r requirements/docs.txt +``` + +!!! tip + Ensure that your Python version is compatible with the plugins + (e.g., `mkdocs-awesome-nav` requires Python 3.10+) + +MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. +From the root of the repository, run: + +```bash +mkdocs serve # with API ref (~10 minutes) +API_AUTONAV_EXCLUDE=vllm mkdocs serve # API ref off (~15 seconds) +``` + +Once you see `Serving on http://127.0.0.1:8000/` in the logs, the live preview is ready! +Open in your browser to see it. 
+ +For additional features and advanced configurations, refer to the: + +- [MkDocs documentation](https://www.mkdocs.org/) +- [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/) (the MkDocs theme we use) + +### Testing + +vLLM uses `pytest` to test the codebase. + +```bash +# Install the test dependencies used in CI (CUDA only) +uv pip install -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto + +# Install some common test dependencies (hardware agnostic) +uv pip install pytest pytest-asyncio + +# Run all tests +pytest tests/ + +# Run tests for a single test file with detailed output +pytest -s -v tests/test_logger.py +``` + +!!! tip "Install python3-dev if Python.h is missing" If any of the above commands fails with `Python.h: No such file or directory`, install `python3-dev` with `sudo apt install python3-dev`. -!!! note +!!! warning "Warnings" Currently, the repository is not fully checked by `mypy`. -!!! note + --- + Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU platform to run unit tests locally, rely on the continuous integration system to run the tests for now. @@ -194,8 +204,7 @@ appropriately to indicate the type of change. Please use one of the following: The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use `pre-commit` to format your code. See - if `pre-commit` is new to you. +- Pass all linter checks. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. 
This diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md index 423bf9b00d07f..06794f8d3120e 100644 --- a/docs/getting_started/installation/python_env_setup.inc.md +++ b/docs/getting_started/installation/python_env_setup.inc.md @@ -1,4 +1,4 @@ -It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands: ```bash uv venv --python 3.12 --seed diff --git a/mkdocs.yaml b/mkdocs.yaml index 507a80c41e8b4..bbd850bdfee34 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -79,6 +79,7 @@ plugins: - "re:vllm\\._.*" # Internal modules - "vllm.third_party" - "vllm.vllm_flash_attn" + - !ENV [API_AUTONAV_EXCLUDE, ""] - mkdocstrings: handlers: python: From b98219670fb1ca2952d449404c2b4921d7cdce73 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 18 Sep 2025 05:08:41 +0100 Subject: [PATCH 03/58] [Core][MM] Cleanup `MultiModalCache` (#25006) Signed-off-by: Lukas Geiger --- vllm/multimodal/cache.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 31ae450f4c2ff..297b4c7fa7fbd 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import operator import sys from abc import 
ABC, abstractmethod from collections.abc import Mapping, Sequence @@ -91,26 +92,15 @@ _V = TypeVar("_V", bound=MultiModalCacheValue) class MultiModalCache: @classmethod - def get_leaf_size( - cls, - leaf: object, - *, - debug: bool = False, - ) -> int: + def get_leaf_size(cls, leaf: object) -> int: if isinstance(leaf, MultiModalProcessorCacheItem): return cls.get_leaf_size(leaf.item) if isinstance(leaf, MultiModalProcessorCacheItemMetadata): return leaf.item_size # These are not subclasses of dict - if isinstance(leaf, MultiModalKwargsItems): - return cls.get_item_size(leaf.data) # type: ignore - if isinstance(leaf, MultiModalKwargsItem): - return cls.get_item_size(leaf.data) # type: ignore - if isinstance(leaf, MultiModalKwargs): - return cls.get_item_size(leaf.data) # type: ignore - - if isinstance(leaf, MultiModalFieldElem): + if isinstance(leaf, (MultiModalKwargs, MultiModalKwargsItems, + MultiModalKwargsItem, MultiModalFieldElem)): return cls.get_item_size(leaf.data) # type: ignore # sys.getsizeof doesn't work for tensors @@ -126,11 +116,8 @@ class MultiModalCache: *, debug: bool = False, ) -> int: - size = json_reduce_leaves( - lambda a, b: a + b, - json_map_leaves(lambda x: cls.get_leaf_size(x, debug=debug), - value), - ) + size = json_reduce_leaves(operator.add, + json_map_leaves(cls.get_leaf_size, value)) if debug: leaf_count = json_count_leaves(value) From 027d37df389b00ed2e7d874113f869267533a2ab Mon Sep 17 00:00:00 2001 From: toncao <130689535+toncao@users.noreply.github.com> Date: Thu, 18 Sep 2025 11:08:50 +0700 Subject: [PATCH 04/58] [Bugfix][Qwen3-Next] add prefixes to shared_expert in qwen3-next and mlp in qwen2moe to successfully load ignored params in quantized models (#24960) Signed-off-by: toncao Co-authored-by: toncao Co-authored-by: Jee Jee Li --- vllm/model_executor/models/qwen2_moe.py | 48 ++++++++++++------------ vllm/model_executor/models/qwen3_next.py | 1 + 2 files changed, 26 insertions(+), 23 deletions(-) diff --git 
a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5e6dea67c9404..6c6276a930453 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -72,17 +72,20 @@ class Qwen2MoeMLP(nn.Module): hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, quant_config=quant_config, - reduce_results=reduce_results) + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -123,7 +126,8 @@ class Qwen2MoeSparseMoeBlock(nn.Module): self.gate = ReplicatedLinear(config.hidden_size, config.num_experts, bias=False, - quant_config=None) + quant_config=None, + prefix=f"{prefix}.gate") if config.shared_expert_intermediate_size > 0: self.shared_expert = Qwen2MoeMLP( hidden_size=config.hidden_size, @@ -132,6 +136,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module): quant_config=quant_config, reduce_results=self.experts.must_reduce_shared_expert_outputs( ), + prefix=f"{prefix}.shared_expert", ) else: self.shared_expert = None @@ -203,21 +208,19 @@ class Qwen2MoeAttention(nn.Module): self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=True, - quant_config=quant_config, - ) + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + quant_config=quant_config, + 
prefix=f"{prefix}.qkv_proj") - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - quant_config=quant_config, - ) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") self.rotary_emb = get_rope( self.head_dim, @@ -296,12 +299,11 @@ class Qwen2MoeDecoderLayer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.mlp") else: - self.mlp = Qwen2MoeMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - ) + self.mlp = Qwen2MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index fe63e93032352..ca9f4d402dac2 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -138,6 +138,7 @@ class Qwen3NextSparseMoeBlock(nn.Module): quant_config=quant_config, reduce_results=self.experts.must_reduce_shared_expert_outputs( ), + prefix=f"{prefix}.shared_expert", ) else: self.shared_expert = None From dc2979c58574e7a49d17b50c5770010039145aac Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Thu, 18 Sep 2025 00:10:21 -0400 Subject: [PATCH 05/58] [Kernels] Overlap shared experts with combine instead of dispatch (#24254) Signed-off-by: Bill Nell --- .../fused_moe/deepep_ht_prepare_finalize.py | 50 +++++++++- .../fused_moe/deepep_ll_prepare_finalize.py | 55 +++++++++-- .../layers/fused_moe/modular_kernel.py | 95 +++++++++++++++---- .../layers/fused_moe/pplx_prepare_finalize.py | 39 +++++++- 4 files changed, 
203 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 5d6b9c87a6b76..f390f0a25875e 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -240,7 +240,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): quant_config) return receiver() - def finalize( + def _finalize( self, output: torch.Tensor, fused_expert_output: torch.Tensor, @@ -248,7 +248,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> None: + do_async: bool, + ) -> Optional[Callable]: assert self.handle is not None @@ -271,7 +272,46 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_weights=None, config=self._get_combine_config(), previous_event=None, - async_finish=False, + async_finish=do_async, allocate_on_comm_stream=False) - # Respect inplace outputs. - output.copy_(combined_x, non_blocking=True) + + if do_async: + + def _receiver(): + event.current_stream_wait() + # Respect inplace outputs. + output.copy_(combined_x, non_blocking=True) + + return lambda: _receiver() + else: + # Respect inplace outputs. 
+ output.copy_(combined_x, non_blocking=True) + return None + + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> Callable: + receiver = self._finalize(output, fused_expert_output, topk_weights, + topk_ids, apply_router_weight_on_input, + weight_and_reduce_impl, True) + assert receiver is not None + return receiver + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + self._finalize(output, fused_expert_output, topk_weights, topk_ids, + apply_router_weight_on_input, weight_and_reduce_impl, + False) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 01df7770463d0..101fc8798c427 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -12,8 +12,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input, normalize_batched_scales_shape) from vllm.v1.worker.ubatching import (dbo_current_ubatch_id, dbo_enabled, - dbo_maybe_run_recv_hook, - dbo_register_recv_hook, dbo_yield) + dbo_maybe_run_recv_hook) # DeepEP kernels quantize dispatch inputs in 128 element chunks. 
DEEPEP_QUANT_BLOCK_SIZE = 128 @@ -198,7 +197,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): hook() return receiver() - def finalize( + def _finalize( self, output: torch.Tensor, fused_expert_output: torch.Tensor, @@ -206,13 +205,14 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> None: + do_async: bool, + ) -> Optional[Callable]: assert isinstance( weight_and_reduce_impl, TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") a2a_idx = dbo_current_ubatch_id() - do_recv_hook = dbo_enabled() + do_recv_hook = dbo_enabled() or do_async handle = self.handles[a2a_idx] assert handle is not None @@ -232,6 +232,45 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): zero_copy=False, return_recv_hook=do_recv_hook, out=output) - if recv_hook is not None: - dbo_register_recv_hook(recv_hook) - dbo_yield() + + return recv_hook + + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> Callable: + recv_hook = self._finalize( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + do_async=True, + ) + assert recv_hook is not None + return recv_hook + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + self._finalize( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + do_async=False, + ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py 
b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 58cd0294c8c44..729f8e39cf0f7 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -209,7 +209,8 @@ class FusedMoEPrepareAndFinalize(ABC): def supports_async(self) -> bool: """ - Indicates whether or not this class implements prepare_async. + Indicates whether or not this class implements prepare_async and + finalize_async. """ return False @@ -275,6 +276,42 @@ class FusedMoEPrepareAndFinalize(ABC): """ raise NotImplementedError + def finalize_async( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: TopKWeightAndReduce, + ) -> Callable: + """ + Perform any combine plus apply weights and perform a reduction on the + fused experts output but do not wait for results from other workers. + - output: The output tensor, written in place. Must be (M, K) shape. + - fused_expert_output: The unweighted, unreduced output of the fused + experts, it will have (M, topk, K) shape. + - topk_weights: The weights to be applied to the fused_experts_output. + - topk_ids: The topk_ids. + - apply_router_weight_on_input: When False, apply the weights to + fused_expert_output. + - weight_and_reduce_impl: An optional TopKWeightAndReduce + implementation. + + Returns a callback that when invoked waits for results from other + workers and has the same return signature as `finalize`, e.g. + + receiver = obj.finalize_async(output, ...) + ... output not valid yet ... + receiver() + ... output valid here ... + + is equivalent to: + + obj.finalize(output, ...) 
+ """ + raise NotImplementedError + @property @abstractmethod def activation_format(self) -> FusedMoEActivationFormat: @@ -814,23 +851,20 @@ class FusedMoEModularKernel(torch.nn.Module): """ a1 = hidden_states - output = a1 if inplace else torch.zeros_like(a1) + if inplace and self.shared_experts is None: + output = a1 + else: + output = torch.zeros_like(a1) local_num_experts = w1.size(0) if global_num_experts == -1: global_num_experts = local_num_experts - shared_output: torch.Tensor - if not self.prepare_finalize.supports_async(): # We shouldn't be running an a2a kernel that doesn't # support async prepare/finalize assert not dbo_enabled() - # Run shared experts serially with dispatch. - if self.shared_experts is not None: - shared_output = self.shared_experts(a1) - (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, _expert_topk_weights) = self.prepare_finalize.prepare( a1, @@ -854,9 +888,6 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.quant_config, ) - if self.shared_experts is not None: - shared_output = self.shared_experts(a1) - # If DBO is being used, register the hook with the ubatch context # and call it in dbo_maybe_run_recv_hook instead of passing it to # the receiver. 
@@ -900,16 +931,42 @@ class FusedMoEModularKernel(torch.nn.Module): apply_router_weight_on_input=apply_router_weight_on_input, ) - self.prepare_finalize.finalize( - output, - fused_out, - topk_weights, - topk_ids, - apply_router_weight_on_input, - self.fused_experts.finalize_weight_and_reduce_impl(), - ) + shared_output: Optional[torch.Tensor] = None + + if not self.prepare_finalize.supports_async(): + assert not dbo_enabled() + + self.prepare_finalize.finalize( + output, + fused_out, + topk_weights, + topk_ids, + apply_router_weight_on_input, + self.fused_experts.finalize_weight_and_reduce_impl(), + ) + if self.shared_experts is not None: + shared_output = self.shared_experts(a1) + else: + recv_hook = self.prepare_finalize.finalize_async( + output, + fused_out, + topk_weights, + topk_ids, + apply_router_weight_on_input, + self.fused_experts.finalize_weight_and_reduce_impl(), + ) + + if self.shared_experts is not None: + shared_output = self.shared_experts(a1) + + assert recv_hook is not None + dbo_register_recv_hook(recv_hook) + dbo_yield() + if not dbo_enabled(): + recv_hook() if self.shared_experts is None: return output else: + assert shared_output is not None return shared_output, output diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 32d12476dd01a..ddddd2a3b7a2e 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -272,7 +272,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): hook() return receiver() - def finalize( + def finalize_async( self, output: torch.Tensor, fused_expert_output: torch.Tensor, @@ -280,7 +280,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): topk_ids: torch.Tensor, apply_router_weight_on_input: bool, weight_and_reduce_impl: mk.TopKWeightAndReduce, - ) -> None: + ) -> Callable: assert isinstance( weight_and_reduce_impl, 
TopKWeightAndReduceDelegate ), ("Weight application and reduction happens in the combine kernel.") @@ -303,8 +303,39 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): if apply_router_weight_on_input: topk_weights = torch.ones_like(topk_weights) + topk_ids_u32 = topk_ids.view(dtype=torch.uint32) + self.a2a.combine(out_tokens=output, - indices=topk_ids.view(dtype=torch.uint32), + indices=topk_ids_u32, weights=topk_weights, expert_y=fused_expert_output, - bound_m=bound_m) + bound_m=bound_m, + do_send=True, + do_recv=False) + + return lambda: self.a2a.combine(out_tokens=output, + indices=topk_ids_u32, + weights=topk_weights, + expert_y=fused_expert_output, + bound_m=bound_m, + do_send=False, + do_recv=True) + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + weight_and_reduce_impl: mk.TopKWeightAndReduce, + ) -> None: + receiver = self.finalize_async( + output, + fused_expert_output, + topk_weights, + topk_ids, + apply_router_weight_on_input, + weight_and_reduce_impl, + ) + receiver() From 52bc9d5b3edbf8804758d46cde28024d6c362e42 Mon Sep 17 00:00:00 2001 From: YiwenC <54658925+666even666@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:11:46 -0700 Subject: [PATCH 06/58] [Model] enable data parallel for InternVL vision encoder (#23909) Signed-off-by: Yiwen Chen Signed-off-by: YiwenC <54658925+666even666@users.noreply.github.com> Co-authored-by: Roger Wang --- docs/configuration/optimization.md | 1 + vllm/model_executor/models/intern_vit.py | 107 ++++++++++++++++------- vllm/model_executor/models/internvl.py | 5 +- 3 files changed, 80 insertions(+), 33 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 5807d787cf531..5564d8a81d937 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -175,6 +175,7 @@ Regardless, you need to set 
`mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: - GLM-4.5V GLM-4.1V () +- InternVL () - Kimi-VL () - Llama4 () - MiniCPM-V-2.5 or above (, ) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 8e9ab9649bd44..118cce810a1f2 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -25,9 +25,11 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.multimodal.utils import run_dp_sharded_vision_model NORM2FN = { 'rms_norm': RMSNorm, @@ -137,6 +139,7 @@ class InternParallelAttention(nn.Module): *, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -150,8 +153,10 @@ class InternParallelAttention(nn.Module): f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' f' {self.num_heads}).') - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) + self.tp_rank = (0 if use_data_parallel else + get_tensor_model_parallel_rank()) # Additional dummy heads are used to enable TP for common GPU counts. 
self.dummy_dim = (num_dummy_heads + self.num_heads) * self.head_dim @@ -159,14 +164,23 @@ class InternParallelAttention(nn.Module): self.tp_size) self.scale = self.head_dim**-0.5 - self.qkv = QKVParallelLinear( - self.embed_dim, - self.head_dim, - num_dummy_heads + self.num_heads, - bias=config.qkv_bias, - quant_config=quant_config, - prefix=f"{prefix}.qkv", - ) + if use_data_parallel: + self.qkv = ReplicatedLinear( + self.embed_dim, + 3 * self.head_dim * self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) + else: + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + num_dummy_heads + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) self.qk_normalization = config.qk_normalization @@ -178,12 +192,20 @@ class InternParallelAttention(nn.Module): eps=config.layer_norm_eps, var_hidden_size=self.embed_dim) - self.proj = RowParallelLinear( - self.dummy_dim, - self.embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - ) + if use_data_parallel: + self.proj = ReplicatedLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + else: + self.proj = RowParallelLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) @@ -287,21 +309,26 @@ class InternMLP(nn.Module): config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.fc1") - self.fc2 = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - 
prefix=f"{prefix}.fc2") + cls_fc1 = (ReplicatedLinear + if use_data_parallel else ColumnParallelLinear) + self.fc1 = cls_fc1(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + cls_fc2 = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.fc2 = cls_fc2(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -320,6 +347,7 @@ class InternVisionEncoderLayer(nn.Module): *, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -330,11 +358,13 @@ class InternVisionEncoderLayer(nn.Module): self.attn = self._init_attn(config, quant_config, num_dummy_heads=num_dummy_heads, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel) self.mlp = InternMLP(config, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) self.norm2 = NORM2FN[self.norm_type](self.embed_dim, @@ -352,16 +382,20 @@ class InternVisionEncoderLayer(nn.Module): *, num_dummy_heads: int, prefix: str = "", + use_data_parallel: bool = False, ): # fallback to sdpa attention if tp unavailable - tp_size = get_tensor_model_parallel_world_size() + # tp_size = get_tensor_model_parallel_world_size() + tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) num_heads = config.num_attention_heads if (num_heads + num_dummy_heads) % tp_size == 0: return InternParallelAttention(config, quant_config=quant_config, num_dummy_heads=num_dummy_heads, - prefix=prefix) + prefix=prefix, + use_data_parallel=use_data_parallel) return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) @@ -388,6 +422,7 @@ class 
InternVisionEncoder(nn.Module): num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() @@ -402,7 +437,8 @@ class InternVisionEncoder(nn.Module): InternVisionEncoderLayer(config, quant_config, num_dummy_heads=num_dummy_heads, - prefix=f"{prefix}.layers.{layer_idx}") + prefix=f"{prefix}.layers.{layer_idx}", + use_data_parallel=use_data_parallel) for layer_idx in range(num_hidden_layers) ]) @@ -429,10 +465,12 @@ class InternVisionModel(nn.Module): num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.embeddings = InternVisionEmbeddings(config) self.encoder = InternVisionEncoder( @@ -441,6 +479,7 @@ class InternVisionModel(nn.Module): num_hidden_layers_override=num_hidden_layers_override, num_dummy_heads=num_dummy_heads, prefix=f"{prefix}.encoder", + use_data_parallel=use_data_parallel, ) def get_input_embeddings(self): @@ -464,7 +503,11 @@ class InternVisionModel(nn.Module): raise ValueError( f'wrong pixel_values size: {pixel_values.shape}') - encoder_outputs = self.encoder(inputs_embeds=hidden_states) + if self.use_data_parallel: + encoder_outputs = run_dp_sharded_vision_model( + hidden_states, self.encoder) + else: + encoder_outputs = self.encoder(inputs_embeds=hidden_states) return encoder_outputs diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 9565628b198e2..6a5c565b52e85 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1035,6 +1035,8 @@ class InternVLMultiModalProcessor( class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if 
modality.startswith("image"): @@ -1053,6 +1055,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self._patch_quant_config(config, quant_config) image_size = config.force_image_size or config.vision_config.image_size @@ -1120,7 +1123,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, prefix=prefix, - ) + use_data_parallel=self.use_data_parallel) else: return InternVisionPatchModel(config.vision_config) From bec060fd99e371b1adc53f65636061f702fa8e61 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Wed, 17 Sep 2025 23:25:07 -0500 Subject: [PATCH 07/58] Mark prompt logprobs as incompatible with prompt embeds at API level (#25077) Signed-off-by: Andrew Sansom --- .../test_completion_with_prompt_embeds.py | 17 +++++++++++++++++ vllm/engine/llm_engine.py | 11 +++++++---- vllm/entrypoints/openai/serving_completion.py | 5 +++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index dbfb1b024f7c2..7b58f851a4d21 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -228,3 +228,20 @@ async def test_completions_with_logprobs_and_prompt_embeds( assert max(logprobs_arg, 1) <= len(top_logprobs) <= logprobs_arg + 1 assert len(logprobs.tokens) == 5 + + +@pytest.mark.asyncio +async def test_prompt_logprobs_raises_error( + client_with_prompt_embeds: openai.AsyncOpenAI): + with pytest.raises(BadRequestError, match="not compatible"): + encoded_embeds = create_dummy_embeds() + await client_with_prompt_embeds.completions.create( + model=MODEL_NAME, + prompt="", + max_tokens=5, + temperature=0.0, + 
extra_body={ + "prompt_embeds": encoded_embeds, + "prompt_logprobs": True + }, + ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c35bd20371d0a..34b5dcb587503 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -671,10 +671,13 @@ class LLMEngine: arrival_time = time.time() if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - seq_len = prompt["prompt_embeds"].shape[0] - prompt["prompt_token_ids"] = [0] * seq_len + and prompt.get("prompt_embeds", None) is not None): + if not prompt.get("prompt_token_ids", None): + seq_len = prompt["prompt_embeds"].shape[0] + prompt["prompt_token_ids"] = [0] * seq_len + if params.prompt_logprobs is not None: + raise ValueError( + "prompt_logprobs is not compatible with prompt embeds.") processed_inputs = self.input_preprocessor.preprocess( prompt, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 044f08f32b0d3..0c61c48da0bc8 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -112,6 +112,11 @@ class OpenAIServingCompletion(OpenAIServing): return self.create_error_response( "Echo is unsupported with prompt embeds.") + if (request.prompt_logprobs is not None + and request.prompt_embeds is not None): + return self.create_error_response( + "prompt_logprobs is not compatible with prompt embeds.") + request_id = ( f"cmpl-" f"{self._base_request_id(raw_request, request.request_id)}") From 3bc18127ff1c644257abcf84a1a56fab8c0d3f0c Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Thu, 18 Sep 2025 12:30:10 +0800 Subject: [PATCH 08/58] [XPU] Whisper model support on XPU Platform (#25123) Signed-off-by: chzhang --- vllm/attention/layer.py | 4 ++-- vllm/v1/worker/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 
22dc6dcbc8d62..15c0ce33e9659 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -391,8 +391,8 @@ class MultiHeadAttention(nn.Module): backend = _Backend.FLASH_ATTN use_upstream_fa = True - if current_platform.is_rocm(): - # currently, only torch_sdpa is supported on rocm + if current_platform.is_rocm() or current_platform.is_xpu(): + # currently, only torch_sdpa is supported on rocm/xpu self.attn_backend = _Backend.TORCH_SDPA else: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index fc831a73a75e3..b76ac633892f3 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -282,7 +282,7 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda(): + if current_platform.is_cuda() or current_platform.is_xpu(): # We know that the GPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. 
From 9d8a2d86d24b8afd849d18ddb4ef51cec1c0471d Mon Sep 17 00:00:00 2001 From: YiwenC <54658925+666even666@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:51:35 -0700 Subject: [PATCH 09/58] [EPLB] Add EPLB support for hunyuan_v1 (#23078) --- vllm/model_executor/layers/fused_moe/layer.py | 4 +- vllm/model_executor/models/hunyuan_v1.py | 135 ++++++++++++++++-- 2 files changed, 123 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index ae3b67a2b84e6..da513d75da4da 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1508,8 +1508,8 @@ class FusedMoE(CustomOp): return [ weight.view(self.local_num_experts, -1) for name, weight in weights - if name not in NON_EXPERT_WEIGHTS - and not name.startswith("_shared_experts.") + if name not in NON_EXPERT_WEIGHTS and weight.shape != torch.Size( + []) and not name.startswith("_shared_experts.") ] def set_eplb_state( diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index db054b5c537e8..4110c8a1fd08d 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only HunYuan model compatible with HuggingFace weights.""" -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable from typing import Any, Optional, Union import regex as re @@ -33,8 +34,8 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import (get_ep_group, get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import SiluAndMul @@ -56,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_layers, maybe_prefix) @@ -355,10 +356,16 @@ class HunYuanSparseMoeBlock(nn.Module): quant_config: Optional[QuantizationConfig] = None, layer_id: int = -1, prefix: str = "", + enable_eplb: bool = False, ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts = config.num_experts + if self.tp_size > config.num_experts: raise ValueError( f"Tensor parallel size {self.tp_size} is greater than " @@ -379,8 +386,23 @@ class HunYuanSparseMoeBlock(nn.Module): config.moe_intermediate_size, int) else config.moe_intermediate_size[layer_id]) + # Load balancing settings. 
+ vllm_config = get_current_vllm_config() + eplb_config = vllm_config.parallel_config.eplb_config + self.enable_eplb = enable_eplb + + self.n_logical_experts = self.n_routed_experts + self.n_redundant_experts = eplb_config.num_redundant_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + self.experts = FusedMoE( - num_experts=config.num_experts, + num_experts=self.n_routed_experts, top_k=top_k, hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -388,6 +410,8 @@ class HunYuanSparseMoeBlock(nn.Module): renormalize=top_k > 1, quant_config=quant_config, prefix=f"{prefix}.experts", + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, ) self.gate = ReplicatedLinear(config.hidden_size, @@ -446,6 +470,7 @@ class HunYuanDecoderLayer(nn.Module): quant_config: Optional[QuantizationConfig] = None, prefix: str = "", layer_id: int = -1, + enable_eplb: bool = False, ) -> None: super().__init__() assert layer_id >= 0 @@ -509,6 +534,7 @@ class HunYuanDecoderLayer(nn.Module): quant_config=quant_config, layer_id=layer_id, prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, ) else: self.mlp = HunYuanMLP( @@ -562,6 +588,9 @@ class HunYuanModel(nn.Module): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + eplb_config = vllm_config.parallel_config.eplb_config + enable_eplb = vllm_config.parallel_config.enable_eplb + self.num_redundant_experts = eplb_config.num_redundant_experts self.config = config self.quant_config = quant_config @@ -588,6 +617,7 @@ class HunYuanModel(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + enable_eplb=enable_eplb, ), 
prefix=f"{prefix}.layers", ) @@ -674,6 +704,7 @@ class HunYuanModel(nn.Module): ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", num_experts=self.config.num_experts, + num_redundant_experts=self.num_redundant_experts, ) else: return [] @@ -803,25 +834,43 @@ class HunYuanModel(nn.Module): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - name = name.replace(weight_name, param_name) - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + # this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name_mapped, self): continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader( + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. + weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader( param, loaded_weight, - name, + name_mapped, shard_id=shard_id, expert_id=expert_id, + return_success=True, ) - break + if success: + name = name_mapped + break else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue # Remapping the name of FP8 kv-scale. 
name = maybe_remap_kv_scale_name(name, params_dict) if name is None: @@ -841,7 +890,7 @@ class HunYuanModel(nn.Module): return loaded_params -class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP): +class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -883,6 +932,64 @@ class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP): else: self.lm_head = PPMissingLayer() + # Set MoE hyperparameters + self.expert_weights = [] + self.num_expert_groups = 1 + self.moe_layers: list[FusedMoE] = [] + example_layer = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, HunYuanDecoderLayer) + if isinstance(layer.mlp, HunYuanSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No HunYuanMoE layer found in model.layers.") + + self.num_moe_layers = len(self.moe_layers) + self.num_logical_experts = example_layer.n_logical_experts + self.num_physical_experts = example_layer.n_physical_experts + self.num_local_physical_experts = example_layer.n_local_physical_experts + self.num_routed_experts = example_layer.n_routed_experts + self.num_redundant_experts = example_layer.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + self.expert_weights.append(layer.get_expert_weights()) + # Register the expert weights. 
+ layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, HunYuanSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def forward( self, input_ids: torch.Tensor, From 5c65a72bb17b34bc6eb0d7ca43b10938c88dc7e3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 17 Sep 2025 22:05:25 -0700 Subject: [PATCH 10/58] [V0 Deprecation] Remove more V0 tests (#25117) Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 6 - .github/CODEOWNERS | 2 - tests/async_engine/__init__.py | 0 tests/async_engine/api_server_async_engine.py | 54 -- tests/async_engine/conftest.py | 12 - tests/async_engine/test_api_server.py | 139 ------ tests/async_engine/test_request_tracker.py | 71 --- tests/basic_correctness/test_preemption.py | 189 ------- tests/detokenizer/conftest.py | 11 - tests/detokenizer/test_stop_checker.py | 83 ---- .../openai/correctness/test_lmeval.py | 10 - tests/samplers/test_logprobs.py | 182 ------- tests/worker/__init__.py | 0 tests/worker/conftest.py | 11 - tests/worker/test_model_input.py | 113 ----- tests/worker/test_model_runner.py | 462 ------------------ tests/worker/test_profile.py | 68 --- tests/worker/test_swap.py | 87 ---- 18 files changed, 1500 deletions(-) delete mode 100644 
tests/async_engine/__init__.py delete mode 100644 tests/async_engine/api_server_async_engine.py delete mode 100644 tests/async_engine/conftest.py delete mode 100644 tests/async_engine/test_api_server.py delete mode 100644 tests/async_engine/test_request_tracker.py delete mode 100644 tests/basic_correctness/test_preemption.py delete mode 100644 tests/detokenizer/conftest.py delete mode 100644 tests/detokenizer/test_stop_checker.py delete mode 100644 tests/samplers/test_logprobs.py delete mode 100644 tests/worker/__init__.py delete mode 100644 tests/worker/conftest.py delete mode 100644 tests/worker/test_model_input.py delete mode 100644 tests/worker/test_model_runner.py delete mode 100644 tests/worker/test_profile.py delete mode 100644 tests/worker/test_swap.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0bce02b90a7cd..8dd99bf1a38f6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -46,22 +46,18 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - - tests/async_engine - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/utils_ - - tests/worker - tests/standalone_tests/lazy_imports.py - tests/transformers_utils commands: - python3 standalone_tests/lazy_imports.py - - pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal - pytest -v -s utils_ # Utils - - pytest -v -s worker # Worker - pytest -v -s transformers_utils # transformers_utils - label: Python-only Installation Test # 10min @@ -82,14 +78,12 @@ steps: - vllm/ - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_preemption - tests/basic_correctness/test_cumem.py commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s 
basic_correctness/test_cpu_offload.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - label: Entrypoints Unit Tests # 5min timeout_in_minutes: 10 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 771dd2e172586..b8d6db06548d5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -41,7 +41,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Test ownership /.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo /tests/distributed/test_multi_node_assignment.py @youkaichao /tests/distributed/test_pipeline_parallel.py @youkaichao /tests/distributed/test_same_node.py @youkaichao @@ -50,7 +49,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche -/tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b9..0000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import 
AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebedaa..0000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 07370a8803291..0000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copyreg -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests -import urllib3.exceptions - - -def _pickle_new_connection_error(obj): - """Custom pickler for NewConnectionError to fix tblib compatibility.""" - # Extract the original message by removing the "conn: " prefix - full_message = obj.args[0] if obj.args else "" - if ': ' in full_message: - # Split off the connection part and keep the actual message - _, actual_message = full_message.split(': ', 1) - else: - actual_message = full_message - return _unpickle_new_connection_error, (actual_message, ) - - -def _unpickle_new_connection_error(message): - """Custom unpickler for NewConnectionError.""" - # Create with None as conn and the actual message - return urllib3.exceptions.NewConnectionError(None, message) - - -# Register the custom pickle/unpickle functions for tblib compatibility -copyreg.pickle(urllib3.exceptions.NewConnectionError, - _pickle_new_connection_error) - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - 
sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.timeout(300) -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. - """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some time to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = 
["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda7905..0000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new 
requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6f..0000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." 
- "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. - return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: 
{vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. 
- for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py deleted file mode 100644 index f2c125355c83c..0000000000000 --- a/tests/detokenizer/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py deleted file mode 100644 index 2ca10c072b342..0000000000000 --- a/tests/detokenizer/test_stop_checker.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.inputs import token_inputs -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob, Sequence, SequenceStatus - - -def sequence_with_eos(text: str, eos_token: str, - eos_token_id: int) -> Sequence: - """ - Create a Sequence that ends with an EOS token. 
- """ - seq = Sequence( - seq_id=0, - inputs=token_inputs([]), - block_size=16, - eos_token_id=eos_token_id, - ) - seq.output_text = text + eos_token - - offset = eos_token_id + 1 - for i in range(offset, len(text) + offset): - seq.append_token_id(token_id=i, logprobs={i: Logprob(0.0)}) - seq.append_token_id(token_id=eos_token_id, - logprobs={eos_token_id: Logprob(0.0)}) - - seq.status = SequenceStatus.RUNNING - - return seq - - -@pytest.mark.parametrize(["text_wo_eos", "eos_token", "eos_token_id"], [ - ("This text ends with EOS token", "", 2), -]) -@pytest.mark.parametrize("ignore_eos", [True, False]) -@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -@pytest.mark.skip_global_cleanup -def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int, - ignore_eos: bool, include_stop_str_in_output: bool): - """ - Test the behavior of the StopChecker's maybe_stop_sequence method - when an EOS token is encountered. - - This test covers: - - When the EOS token should stop the sequence and be removed from the output - - When the EOS token should stop the sequence and be included in the output - - When the EOS token should be ignored, and the sequence continues - """ - - stop_checker = StopChecker(max_model_len=1024) - - seq = sequence_with_eos( - text=text_wo_eos, - eos_token=eos_token, - eos_token_id=eos_token_id, - ) - new_char_count = len(eos_token) - - # Note that `stop` and `stop_token_ids` are not specified - sampling_params = SamplingParams( - min_tokens=1, - ignore_eos=ignore_eos, - include_stop_str_in_output=include_stop_str_in_output) - - stop_checker.maybe_stop_sequence( - seq=seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - if ignore_eos: - assert seq.status == SequenceStatus.RUNNING - assert seq.output_text == text_wo_eos + eos_token - elif include_stop_str_in_output: - assert seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos + eos_token - else: - assert 
seq.status == SequenceStatus.FINISHED_STOPPED - assert seq.output_text == text_wo_eos diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index 684407cd6ee97..624acd5ffde73 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -81,13 +81,3 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch): more_args = ["--max-num-seqs", "64"] run_test(more_args) - - -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch, - more_args): - """Run with the V0 Engine.""" - - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test(more_args) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py deleted file mode 100644 index 87f40b1005312..0000000000000 --- a/tests/samplers/test_logprobs.py +++ /dev/null @@ -1,182 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -from ..conftest import VllmRunner - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module is V0 only since it uses dtype=float, so - set VLLM_USE_V1=0 for all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["float"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size -@pytest.mark.parametrize("detokenize", [True, False]) -def test_get_prompt_logprobs( - hf_runner, - vllm_runner, - model, - dtype, - chunked_prefill_token_size: int, - num_top_logprobs: int, - detokenize: bool, - example_prompts, -): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - max_tokens = 5 - with hf_runner(model, dtype=dtype) as hf_model: - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - - with vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - # Test whether logprobs are included in the results. 
- for result in vllm_results: - assert result.prompt_logprobs is not None - assert result.outputs[0].logprobs is not None - assert len(result.outputs[0].logprobs) == max_tokens - for logprobs in result.outputs[0].logprobs: - # If the output token is not included in the top X - # logprob, it can return 1 more data - assert (len(logprobs) == num_top_logprobs - or len(logprobs) == num_top_logprobs + 1) - output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: list[str] = [] - for top_logprobs in result.outputs[0].logprobs: - top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens_lst.append( - top_logprob.decoded_token) - - if detokenize: - output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens_lst) - assert output_text == output_string_from_most_likely_tokens, ( - "The output text from the top logprob for each token position " - "should be the same as the output text in the result.") - else: - assert output_text == '' - assert output_string_from_most_likely_tokens_lst == ([None] * - max_tokens) - - # The first prompt logprob is always None - assert result.prompt_logprobs[0] is None - for prompt_logprobs in result.prompt_logprobs[1:]: - # If the prompt token is not included in the top X - # logprob, it can return 1 more data - assert (len(prompt_logprobs) == num_top_logprobs - or len(prompt_logprobs) == num_top_logprobs + 1) - - # Test whether prompt logprobs are consistent with HF - for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): - # Check prompt logprobs - # The first prompt logprob is always None, so we compare it from 1:. 
- vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] - for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): - for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob.logprob, - hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) - vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, top_logprobs in enumerate(vllm_sample_logprobs): - for token_id, sample_logprob in top_logprobs.items(): - logprob = sample_logprob.logprob - torch.testing.assert_close(logprob, - hf_logprob[i][-1][token_id].item(), - atol=1e-2, - rtol=1e-2) - if detokenize: - assert isinstance(sample_logprob.decoded_token, str), ( - "The token should be decoded by the time it is returned" - " to the user.") - - # Test if prompt logprobs are correctly set. - for vllm_result in vllm_results: - token_ids = vllm_result.prompt_token_ids - prompt_logprobs = vllm_result.prompt_logprobs - - # The first token doesn't have logprob. - assert prompt_logprobs[0] is None - - for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]): - assert token_id in logprob_dict - - -def test_max_logprobs(): - runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, - detokenize: bool, example_prompts): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = 
min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - max_tokens = 5 - - with vllm_runner( - model, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - temperature=0.0, - detokenize=detokenize) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) - - for i in range(len(results_logprobs_none)): - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py deleted file mode 100644 index 3f202d4dbe948..0000000000000 --- a/tests/worker/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py deleted file mode 100644 index 0f28ef2ba857b..0000000000000 --- a/tests/worker/test_model_input.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses - -import torch - -from vllm.attention import AttentionMetadata, AttentionMetadataBuilder -from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.model_executor import SamplingMetadata -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - - -class MockAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - raise NotImplementedError - - @staticmethod - def get_impl_cls(): - raise NotImplementedError - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return AttentionMetadata - - @staticmethod - def get_builder_cls() -> type["AttentionMetadataBuilder"]: - return AttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> tuple[int, ...]: - raise NotImplementedError - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - pass - - @staticmethod - def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - pass - - -def test_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - 
slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. - assert isinstance(received_model_input, - ModelInputForGPUWithSamplingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. 
- assert (received_model_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_model_input.sampling_metadata.seq_groups is None diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py deleted file mode 100644 index 0be25aa2fc35d..0000000000000 --- a/tests/worker/test_model_runner.py +++ /dev/null @@ -1,462 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner - - -def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = ModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -def test_deepseek_mla_attn_backend_module(): - model_runner = _create_model_runner( - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - assert model_runner.attn_backend.__name__ == "TritonMLABackend" - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - 
- seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - expected_input_embeds_len = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len)) - - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - - # Verify input metadata is correct for prompts. - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - torch.testing.assert_close( - attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - - # Test subquery start locs. 
- start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - # Test seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device)) - - expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device) - torch.testing.assert_close(attn_metadata.block_tables, expected) - # Cuda graph should not be used for prerill. 
- assert attn_metadata.use_cuda_graph is False - - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - if expected_input_embeds_len == 0: - torch.testing.assert_close(input_tokens, input_positions) - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=model_runner.device, - pin_memory=model_runner.pin_memory) - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - torch.allclose(input_tokens, input_positions) - - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - context_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - # Assume each seq group finishes prefill. 
- for i in range(batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - context_lens.append(context_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len)) - output_embed = None - seq_data.update_num_computed_tokens(context_len) - # Append one token ID since prefill is finished. - seq_data.append_token_id(1, 0, output_embed) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - slot_mapping = attn_metadata.slot_mapping - - assert len(slot_mapping) == len(input_tokens) - - expected_bs = model_runner.vllm_config.pad_for_cudagraph( - len(seq_group_metadata_list)) - # Verify input metadata is correct for prompts. - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_prefill_tokens == 0 - seq_lens = [context_len + 1 for context_len in context_lens] - # seq_lens are padded to expected_bs - for _ in range(expected_bs - len(seq_lens)): - seq_lens.append(1) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.num_decode_tokens == len(seq_lens) - start_idx = 0 - start_loc = [start_idx] - for _ in context_lens: - # decode has only 1 token for query. 
- start_idx += 1 - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device)) - - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.tensor(context_lens, dtype=torch.int, device=device)) - assert attn_metadata.max_decode_seq_len == max(seq_lens) - torch.testing.assert_close( - attn_metadata.seq_lens_tensor[:len(seq_lens)], - torch.tensor(seq_lens, dtype=torch.int, device=device)) - - # block table's first index corresponds to each batch, meaning in - # decoding it is each token. - assert attn_metadata.block_tables.shape[0] == len(input_tokens) - # Block table's second dim corresponds to each token's block number. - # It is padded up to - assert attn_metadata.block_tables.shape[1] == ( - model_runner.get_max_block_per_batch()) - assert attn_metadata.use_cuda_graph is True - - assert len(input_tokens) == expected_bs - assert len(input_positions) == expected_bs - if use_prompt_embeds: - expected_input_embeds_length = start_loc[-1] - assert len(input_embeds) == expected_input_embeds_length - assert expected_input_embeds_length <= expected_bs - else: - assert input_embeds is None - - # Verify Sampling - expected_selected_token_indices = [] - for selected_token_start_idx, _ in enumerate(context_lens): - expected_selected_token_indices.append(selected_token_start_idx) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query lens is all 1 for decode. 
- query_lens=[1 for _ in range(len(context_lens))], - device=model_runner.device, - pin_memory=model_runner.pin_memory) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output.""" - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - - assert input_tokens is None - assert input_positions is None - assert attn_metadata is None - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - - assert input_tokens is None - assert input_positions is None - assert input_embeds is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.fixture -def distributed_init(): - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", - local_rank=0) - ensure_model_parallel_initialized(1, 1) - - -@pytest.mark.parametrize("batch_size", list(range(2, 128, 3))) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize('use_prompt_embeds', [True, False]) -def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds, - distributed_init, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - 
monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=enforce_eager, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=True, - enable_prompt_embeds=True, - ) - - # Add prefill requests. - seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - prefill_metadata_list: list[SequenceGroupMetadata] = [] - decode_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - prefill_batch_size = batch_size // 2 - decode_batch_size = batch_size - prefill_batch_size - expected_input_embeds_len = 0 - for i in range(prefill_batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(seq_len), ) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - prefill_metadata_list.append(seq_group_metadata) - - # Add decode requests - for i in range(prefill_batch_size, batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - # This also iterates the expected input_embeds, because the model - # needs both the input and output embeddings passed into together - expected_input_embeds_len += 1 - else: - seq_data = 
SequenceData.from_seqs( - prompt_token_ids=range(context_len), ) - output_embed = None - assert len(seq_data.prompt_token_ids) == context_len - seq_data.append_token_id(1, 0, output_embed) - seq_data.update_num_computed_tokens(context_len) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - decode_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - - prefill_meta_actual = attn_metadata.prefill_metadata - decode_meta_actual = attn_metadata.decode_metadata - - assert len(attn_metadata.slot_mapping) == len(input_tokens) - assert len(input_positions) == len(input_tokens) - assert attn_metadata.num_prefills == prefill_batch_size - assert attn_metadata.num_decode_tokens == decode_batch_size - assert attn_metadata.num_prefill_tokens == sum(seq_lens) - if expected_input_embeds_len == 0: - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - # Verify attn metadata is consistent. We don't need to test individual - # values here because they are tested above. 
- attn_metadata = model_runner._prepare_model_input_tensors( - seq_group_metadata_list).attn_metadata - - for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), - vars(prefill_meta_actual)): - assert attr_expected[1] == attr_actual[1] - for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata), - vars(decode_meta_actual)): - assert attr_expected[1] == attr_actual[1] diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py deleted file mode 100644 index d8767f700b576..0000000000000 --- a/tests/worker/test_profile.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker - - -def test_gpu_memory_profiling(): - # Tests the gpu profiling that happens in order to determine the number of - # KV cache blocks that we can allocate on the GPU. - # This test mocks the maximum available gpu memory so that it can run on - # any gpu setup. - - # Set up engine args to build a worker. - engine_args = EngineArgs(model="facebook/opt-125m", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. 
- distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Set 10GiB as the total gpu ram to be device-agnostic - def mock_mem_info(): - current_usage = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - mock_total_bytes = 10 * 1024**3 - free = mock_total_bytes - current_usage - - return (free, mock_total_bytes) - - from unittest.mock import patch - with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): - # Load the model so we can profile it - worker.init_device() - worker.load_model() - gpu_blocks, _ = worker.determine_num_available_blocks() - - # Peak vram usage by torch should be 0.47 GiB - # Model weights take 0.25 GiB - # No memory should be allocated outside of torch - # 9.0 GiB should be the utilization target - # 8.28 GiB should be available for the KV cache - block_size = CacheEngine.get_cache_block_size( - engine_config.cache_config, engine_config.model_config, - engine_config.parallel_config) - - expected_blocks = (8.28 * 1024**3) // block_size - - # Check within a small tolerance for portability - # Hardware, kernel, or dependency changes could all affect memory - # utilization. - # A 100 block tolerance here should be about 60MB of wiggle room. 
- assert abs(gpu_blocks - expected_blocks) < 100 diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py deleted file mode 100644 index 6d9f404ac207b..0000000000000 --- a/tests/worker/test_swap.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.worker import Worker - - -def test_swap() -> None: - # Configure the engine. - engine_args = EngineArgs(model="distilbert/distilgpt2", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Initialize the worker. - worker.init_device() - worker.load_model() - worker.initialize_cache( - num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - # Randomly initialize the cache. - gpu_cache = worker.cache_engine[0].gpu_cache - cpu_cache = worker.cache_engine[0].cpu_cache - num_layers = len(gpu_cache) - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - gpu_key_cache.random_() - gpu_value_cache.random_() - cpu_key_cache, cpu_value_cache = cpu_cache[i] - cpu_key_cache.random_() - cpu_value_cache.random_() - - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=0.0, atol=0.0) - - # Test swap out. 
- blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)] - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=[], - blocks_to_swap_in=[], - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=[], - ) - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in blocks_to_swap_out: - assert allclose(gpu_key_cache[src], cpu_key_cache[dst]) - assert allclose(gpu_value_cache[src], cpu_value_cache[dst]) - - # Test swap in. - execute_model_req.blocks_to_swap_out = [] - execute_model_req.blocks_to_swap_in = [ - (19, 45), - (67, 23), - (12, 78), - (40, 99), - (1, 71), - ] - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in execute_model_req.blocks_to_swap_in: - assert allclose(gpu_key_cache[dst], cpu_key_cache[src]) - assert allclose(gpu_value_cache[dst], cpu_value_cache[src]) From b7433ca1a47732394b1bdea4099d98389515954b Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Thu, 18 Sep 2025 01:07:24 -0400 Subject: [PATCH 11/58] [Spec Decode] Efficient padded speculation (#24539) Signed-off-by: Benjamin Chislett --- tests/v1/spec_decode/test_eagle.py | 179 +++++++++++++++++++- vllm/config/speculative.py | 5 + vllm/v1/spec_decode/eagle.py | 258 +++++++++++++++++++++++++---- vllm/v1/worker/gpu_input_batch.py | 5 +- vllm/v1/worker/gpu_model_runner.py | 164 +++++++++++------- 5 files changed, 507 insertions(+), 104 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index ccab04628a163..e7f6b68fc3f77 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -19,6 +19,8 @@ from vllm.config.load import LoadConfig from vllm.model_executor.models.llama import LlamaForCausalLM from 
vllm.platforms import current_platform from vllm.v1.spec_decode.eagle import EagleProposer +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch model_dir = "meta-llama/Llama-3.1-8B-Instruct" eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" @@ -64,6 +66,86 @@ def _create_proposer( device=current_platform.device_type) +def test_prepare_next_token_ids(): + """ + Test for prepare_next_token_ids_cpu and prepare_next_token_ids_padded. + Each will produce a device tensor of next_token_ids, taking as input + either the GPU tensor of sampled_token_ids with -1 for rejected tokens, + or the CPU python list[list[int]] with the rejected tokens removed. + """ + device = torch.device(current_platform.device_type) + + num_requests = 4 + num_speculative_tokens = 4 + batch_spec = BatchSpec( + seq_lens=[num_speculative_tokens + 1] * num_requests, + query_lens=[num_speculative_tokens + 1] * num_requests, + ) + + req_ids = [f"req_{i+1}" for i in range(num_requests)] + mock_input_batch = mock.MagicMock(spec=InputBatch) + mock_input_batch.req_ids = req_ids + mock_input_batch.num_reqs = num_requests + mock_input_batch.vocab_size = 100 + + mock_num_scheduled_tokens = {req_id: 0 for req_id in req_ids} + mock_requests = {} + for req_id in req_ids: + mock_request = mock.MagicMock(spec=CachedRequestState) + # Each request will have a backup next token id of 10, 20, 30, 40 + mock_request.get_token_id.return_value = int(req_id.split("_")[1]) * 10 + mock_request.num_computed_tokens = 0 + mock_requests[req_id] = mock_request + + sampled_token_ids = [ + [0, 1, -1, -1, -1], # 1 accepted, 3 rejected, "1" sampled + [0, 1, 2, 3, 4], # all accepted, "4" sampled + [-1, -1, -1, -1, -1], # sampling skipped, use backup token "30" + [-1, -1, -1, -1, -1] # this request will be discarded + ] + sampled_token_ids_tensor = torch.tensor(sampled_token_ids, + dtype=torch.int32, + device=device) + sampled_token_ids_cpu = [[i for i in 
seq if i != -1] + for seq in sampled_token_ids] + + expected_next_token_ids_cpu = [1, 4, 30, 40] + expected_next_token_ids_tensor = torch.tensor(expected_next_token_ids_cpu, + dtype=torch.int32, + device=device) + + proposer = _create_proposer("eagle", num_speculative_tokens) + + next_token_ids_from_cpu = proposer.prepare_next_token_ids_cpu( + sampled_token_ids_cpu, mock_requests, mock_input_batch, + mock_num_scheduled_tokens) + + assert torch.equal(next_token_ids_from_cpu, expected_next_token_ids_tensor) + + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + + discarded_req_indices = torch.tensor([3], dtype=torch.int64, device=device) + num_discarded_reqs = 1 + + expected_valid_sampled_tokens_count = torch.tensor([2, 5, 0, 0], + dtype=torch.int32, + device=device) + + next_token_ids_from_padded, valid_sampled_tokens_count = \ + proposer.prepare_next_token_ids_padded( + common_attn_metadata, sampled_token_ids_tensor, mock_requests, + mock_input_batch, discarded_req_indices, num_discarded_reqs) + + assert torch.equal(next_token_ids_from_padded, + expected_next_token_ids_tensor) + assert torch.equal(valid_sampled_tokens_count, + expected_valid_sampled_tokens_count) + + def test_prepare_inputs(): """ cu_target_query_lens: [0, a, a + b, a + b + c] @@ -90,10 +172,24 @@ def test_prepare_inputs(): device=device, ) - # Rejected tokens per request: [1, 3, 2] - num_rejected_tokens = torch.tensor([1, 3, 2], - dtype=torch.int32, - device=device) + # If there are `k` sampled tokens, then `k-1` tokens are draft tokens + # from the previous iteration, and the last token is the bonus token sampled + # from the base model. 
+ num_draft_tokens = [3, 6, 4] # one less than query_lens + # num rejected tokens is [1, 3, 2] + ACCEPT_TOKEN = 0 + BONUS_TOKEN = 1 + REJECT_TOKEN = -1 + sampled_token_ids = [ + [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, BONUS_TOKEN], + [ + ACCEPT_TOKEN, ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, + REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN + ], + [ACCEPT_TOKEN, ACCEPT_TOKEN, REJECT_TOKEN, REJECT_TOKEN, BONUS_TOKEN] + ] + sampled_token_ids = [[i for i in seq if i != REJECT_TOKEN] + for seq in sampled_token_ids] # Expected calculations: # query_len_per_req = [4, 7, 5] @@ -125,7 +221,7 @@ def test_prepare_inputs(): proposer = _create_proposer("eagle", 1) updated_metadata, token_indices = proposer.prepare_inputs( - common_attn_metadata, num_rejected_tokens.cpu()) + common_attn_metadata, sampled_token_ids, num_draft_tokens) assert torch.equal(updated_metadata.query_start_loc, expected_cu_num_tokens) @@ -133,6 +229,77 @@ def test_prepare_inputs(): assert torch.equal(token_indices, expected_token_indices) +def test_prepare_inputs_padded(): + """ + Input scenario is 3 requests with num_speculative_tokens == 2 and: + - Request 1: query_len = 3, rejected = 1 + - Request 2: query_len = 3, rejected = 0 + - Request 3: query_len = 3, rejected = 2 + + Expected outputs: + token_indices: [0, 1, 2, + 3, 4, 5, + 6, 7, 8] + Reason: Deferred computation should not disturb the original indices. + + token_indices_to_sample: [1, 5, 6] + Reason: After accounting for rejections, these are the valid token positions + from the original indices to sample from. 
+ """ + + device = torch.device(current_platform.device_type) + + expected_token_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8], + dtype=torch.int32, + device=device) + expected_token_indices_to_sample = torch.tensor([1, 5, 6], + dtype=torch.int32, + device=device) + + num_speculative_tokens = 2 + batch_spec = BatchSpec( + seq_lens=[3, 3, 3], + query_lens=[3, 3, 3], + ) + + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + + # Needed for cu_num_draft_tokens, which is expected to be [3, 6, 9] + expected_query_start_loc = torch.tensor([0, 3, 6, 9], + dtype=torch.int32, + device=device) + spec_decode_metadata = SpecDecodeMetadata.make_dummy( + draft_token_ids=[[0] * num_speculative_tokens] * 3, + device=device, + ) + + # num_rejected_tokens = [1, 0, 2] + # num_draft_tokens = [2, 2, 2] + # valid_sampled_tokens_count = num_draft_tokens + 1 - num_rejected_tokens + valid_sampled_tokens_count = torch.tensor([2, 3, 1], + dtype=torch.int32, + device=device) + + proposer = _create_proposer("eagle", num_speculative_tokens) + + output_metadata, token_indices, token_indices_to_sample = \ + proposer.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count) + + assert output_metadata.max_query_len == 3 + assert torch.equal(output_metadata.query_start_loc, + expected_query_start_loc) + assert torch.equal(token_indices, expected_token_indices) + assert torch.equal(token_indices_to_sample, + expected_token_indices_to_sample) + + @pytest.mark.parametrize("method", ["eagle", "eagle3"]) @pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform()) @@ -373,6 +540,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): target_positions=target_positions, target_hidden_states=target_hidden_states, next_token_ids=next_token_ids, + last_token_indices=None, common_attn_metadata=common_attn_metadata, sampling_metadata=sampling_metadata) @@ -526,6 
+694,7 @@ def test_propose_tree(spec_token_tree): target_positions=target_positions, target_hidden_states=target_hidden_states, next_token_ids=next_token_ids, + last_token_indices=None, common_attn_metadata=common_attn_metadata, sampling_metadata=sampling_metadata) assert result.shape == (batch_size, num_speculative_tokens) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index b2d50e3852337..fca8c28e5c61e 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -83,6 +83,11 @@ class SpeculativeConfig: disable_by_batch_size: Optional[int] = None """Disable speculative decoding for new incoming requests when the number of enqueued requests is larger than this value, if provided.""" + disable_padded_drafter_batch: bool = False + """Disable input padding for speculative decoding. If set to True, + speculative input batches can contain sequences of different lengths, + which may only be supported by certain attention backends. This currently + only affects the EAGLE method of speculation.""" # Ngram proposer configuration prompt_lookup_max: Optional[int] = None diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5154b29405b6e..2a178ddf48777 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -27,6 +27,9 @@ from vllm.v1.attention.backends.triton_attn import TritonAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.utils import CpuGpuBuffer +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.ubatching import dbo_current_ubatch_id logger = init_logger(__name__) @@ -94,20 +97,26 @@ class EagleProposer: dtype=self.dtype, device=device) + # We need +1 here because the arange is used to set query_start_loc, + # which has one more 
element than batch_size. max_batch_size = vllm_config.scheduler_config.max_num_seqs - self.arange = torch.arange( - # We need +1 here because the arange is used to set query_start_loc, - # which has one more element than batch_size. - max_batch_size + 1, - device=device, - dtype=torch.int32, - ) + max_num_slots_for_arange = max(max_batch_size + 1, self.max_num_tokens) + self.arange = torch.arange(max_num_slots_for_arange, + device=device, + dtype=torch.int32) self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, device=device) + self.backup_next_token_ids = CpuGpuBuffer( + max_batch_size, + dtype=torch.int32, + pin_memory=is_pin_memory_available(), + device=device, + with_numpy=True) + # Determine allowed attention backends once during initialization. self.allowed_attn_types: tuple[type[EagleAttentionMetadata], ...] if current_platform.is_rocm(): @@ -156,13 +165,16 @@ class EagleProposer: target_hidden_states: torch.Tensor, # [batch_size] next_token_ids: torch.Tensor, + last_token_indices: Optional[torch.Tensor], common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, mm_embeds: Optional[list[torch.Tensor]] = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] - last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 + + if last_token_indices is None: + last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 if self.method == "eagle3": assert isinstance(self.model, Eagle3LlamaForCausalLM) @@ -228,6 +240,12 @@ class EagleProposer: last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) + + # Early exit if there is only one draft token to be generated. 
+ if self.num_speculative_tokens == 1: + draft_token_ids = logits.argmax(dim=-1) + return draft_token_ids.view(-1, 1) + positions = target_positions[last_token_indices] hidden_states = hidden_states[last_token_indices] @@ -245,15 +263,12 @@ class EagleProposer: draft_token_ids = logits.argmax(dim=-1) - # Early exit if there is only one draft token to be generated. - if self.num_speculative_tokens == 1: - # [batch_size, 1] - return draft_token_ids.view(-1, 1) - - # TODO: Currently, MTP module released by deepseek only has - # one layer. Adapt this code to support multiple layers once - # there's a multi-layer MTP module. - assert isinstance(attn_metadata, self.allowed_attn_types) + if not isinstance(attn_metadata, self.allowed_attn_types): + raise ValueError( + f"Unsupported attention metadata type for speculative " + "decoding with num_speculative_tokens > 1: " + f"{type(attn_metadata)}. Supported types are: " + f"{self.allowed_attn_types}") # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] @@ -263,10 +278,13 @@ class EagleProposer: input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) else: input_batch_size = batch_size - attn_metadata.num_actual_tokens = batch_size - attn_metadata.max_query_len = 1 - attn_metadata.query_start_loc = self.arange[:batch_size + 1] - for _ in range(self.num_speculative_tokens - 1): + + common_attn_metadata.num_actual_tokens = batch_size + common_attn_metadata.max_query_len = 1 + common_attn_metadata.query_start_loc = self.arange[:batch_size + 1] + common_attn_metadata.query_start_loc_cpu = torch.from_numpy( + self.token_arange_np[:batch_size + 1]).clone() + for token_index in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. @@ -286,27 +304,38 @@ class EagleProposer: positions) # Increment the sequence lengths. 
- attn_metadata.max_seq_len += 1 - attn_metadata.seq_lens += 1 - # Consider max model length. - attn_metadata.max_seq_len = min(attn_metadata.max_seq_len, - self.max_model_len) + common_attn_metadata.seq_lens += 1 + common_attn_metadata.seq_lens_cpu += 1 # For the requests that exceed the max model length, we set the # sequence length to 1 to minimize their overheads in attention. - attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) + common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, + 1) + + common_attn_metadata.num_computed_tokens_cpu = \ + common_attn_metadata.seq_lens_cpu - 1 # Compute the slot mapping. block_numbers = clamped_positions // self.block_size - block_ids = attn_metadata.block_table.gather( + block_ids = common_attn_metadata.block_table_tensor.gather( dim=1, index=block_numbers.view(-1, 1)) block_ids = block_ids.view(-1) - attn_metadata.slot_mapping = (block_ids * self.block_size + - clamped_positions % self.block_size) + common_attn_metadata.slot_mapping = ( + block_ids * self.block_size + + clamped_positions % self.block_size) # Mask out the slot mappings that exceed the max model length. # Otherwise, the KV cache will be inadvertently updated with the # padding tokens. 
- attn_metadata.slot_mapping.masked_fill_(exceeds_max_model_len, - PADDING_SLOT_ID) + common_attn_metadata.slot_mapping.masked_fill_( + exceeds_max_model_len, PADDING_SLOT_ID) + + # Rebuild attention metadata + attn_metadata_builder = \ + self.runner.attn_groups[0][0].metadata_builders[ubatch_id] + attn_metadata = attn_metadata_builder\ + .build_for_drafting(common_attn_metadata=common_attn_metadata, + draft_index=token_index + 1) + for layer_name in self.attn_layer_names: + per_layer_attn_metadata[layer_name] = attn_metadata # copy inputs to buffer for cudagraph self.input_ids[:batch_size] = input_ids @@ -347,6 +376,158 @@ class EagleProposer: draft_token_ids = torch.stack(draft_token_ids_list, dim=1) return draft_token_ids + def prepare_next_token_ids_cpu( + self, sampled_token_ids: list[list[int]], + requests: dict[str, + CachedRequestState], gpu_input_batch: InputBatch, + num_scheduled_tokens: dict[str, int]) -> torch.Tensor: + """ + This function is used to prepare the inputs for speculative decoding. + It calculates the next token ids for each request based on the sampled + token ids from the CPU. If a request has no sampled token ids (e.g., + during the initial decoding steps), it falls back to using the request + state to get the next token id. + """ + req_ids = gpu_input_batch.req_ids + next_token_ids: list[int] = [] + for i, token_ids in enumerate(sampled_token_ids): + if token_ids: + # Common case. + next_token_id = token_ids[-1] + else: + # Partial prefill (rare case). + # Get the next token id from the request state. 
+ req_id = req_ids[i] + req_state = requests[req_id] + seq_len = (req_state.num_computed_tokens + + num_scheduled_tokens[req_id]) + next_token_id = req_state.get_token_id(seq_len) + next_token_ids.append(next_token_id) + next_token_ids = torch.tensor(next_token_ids, + dtype=torch.int32, + device=self.input_ids.device) + return next_token_ids + + def prepare_next_token_ids_padded(self, + common_attn_metadata: CommonAttentionMetadata, + sampled_token_ids: torch.Tensor, + requests: dict[str, CachedRequestState], + gpu_input_batch: InputBatch, + discard_request_indices: torch.Tensor, + num_discarded_requests: int) -> \ + tuple[torch.Tensor, torch.Tensor]: + """ + This function is used to prepare the inputs for speculative decoding. + It calculates the next token ids and the number of valid sampled tokens + for each request, considering the "discarded" requests whose next token + is not sampled and comes from `request.get_token_id()` instead. + It also accounts for the rejected tokens in `sampled_token_ids`. + This function must use device functions to operate on the inputs, and + should not introduce any blocking CPU-GPU synchronization. + """ + # TODO(Ben): Combine this into a custom fused kernel + + # Precompute get_token_id for when there is no valid next token + num_reqs = gpu_input_batch.num_reqs + self.backup_next_token_ids.np[:num_reqs] = np.array([ + requests[gpu_input_batch.req_ids[i]].get_token_id( + common_attn_metadata.seq_lens_cpu[i].item()) + for i in range(num_reqs) + ]) + self.backup_next_token_ids.copy_to_gpu(num_reqs) + + # Mask out the sampled tokens indices that should not be sampled. 
+ discard_sampled_tokens_req_indices = \ + discard_request_indices[:num_discarded_requests] + + valid_sampled_token_ids_gpu = sampled_token_ids.clone() + valid_sampled_token_ids_gpu.index_fill_( + 0, discard_sampled_tokens_req_indices, -1) + + # Generate a mask for all valid tokens within those requests + max_gen_len = sampled_token_ids.shape[-1] + if max_gen_len == 1: + valid_mask = torch.ones_like(valid_sampled_token_ids_gpu, + dtype=torch.bool) + else: + valid_mask = ( + (valid_sampled_token_ids_gpu != -1) & + (valid_sampled_token_ids_gpu < gpu_input_batch.vocab_size)) + + # Count the number of valid tokens in each request + valid_sampled_tokens_count = valid_mask.sum(dim=1) + + # Get the rightmost valid index per row + last_valid_indices = valid_sampled_tokens_count - 1 + last_valid_indices_safe = torch.clamp(last_valid_indices, min=0) + + # Get last valid token from each row + # (assume undefined state where there is no valid token) + selected_tokens = torch.gather( + valid_sampled_token_ids_gpu, 1, + last_valid_indices_safe.unsqueeze(1)).squeeze(1) + + # Use last token if valid, pre-computed backup if not + batch_size = valid_sampled_token_ids_gpu.shape[0] + next_token_ids = torch.where( + last_valid_indices != -1, selected_tokens, + self.backup_next_token_ids.gpu[:batch_size]) + + return next_token_ids, valid_sampled_tokens_count + + def prepare_inputs_padded(self, + common_attn_metadata: CommonAttentionMetadata, + spec_decode_metadata: SpecDecodeMetadata, + valid_sampled_tokens_count: torch.Tensor) -> \ + tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]: + """ + This function is used to prepare the inputs for speculative decoding + It updates the common_attn_metadata for speculative decoding, + but does not consider the rejected tokens. Instead, all tokens + are included as inputs to the speculator, with the rejected tokens + used as padding and filtered out later by `token_indices_to_sample`. 
+ No blocking CPU operations should be introduced in this function. + """ + num_draft_tokens_gpu = torch.cat([ + spec_decode_metadata.cu_num_draft_tokens[0:1], + spec_decode_metadata.cu_num_draft_tokens[1:] - + spec_decode_metadata.cu_num_draft_tokens[:-1] + ]) + + num_rejected_tokens_gpu = torch.where( + num_draft_tokens_gpu > 0, + num_draft_tokens_gpu + 1 - valid_sampled_tokens_count, + torch.zeros_like(num_draft_tokens_gpu)) + + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + + new_query_len_per_req = (query_start_loc_cpu[1:] - + query_start_loc_cpu[:-1]) + + total_num_tokens = query_start_loc_cpu[-1].item() + token_indices = self.arange[:total_num_tokens] + + spec_common_attn_metadata = CommonAttentionMetadata( + query_start_loc=common_attn_metadata.query_start_loc, + seq_lens=common_attn_metadata.seq_lens, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens_cpu=common_attn_metadata.seq_lens_cpu, + num_computed_tokens_cpu=common_attn_metadata. + num_computed_tokens_cpu, + num_reqs=common_attn_metadata.num_reqs, + num_actual_tokens=total_num_tokens, + max_query_len=new_query_len_per_req.max().item(), + max_seq_len=common_attn_metadata.seq_lens_cpu.max().item(), + block_table_tensor=common_attn_metadata.block_table_tensor, + slot_mapping=common_attn_metadata.slot_mapping[token_indices], + causal=True, + ) + + token_indices_to_sample = common_attn_metadata.query_start_loc[1:] - 1 \ + - num_rejected_tokens_gpu + + return spec_common_attn_metadata, token_indices, token_indices_to_sample + def propose_tree( self, batch_size: int, @@ -520,11 +701,11 @@ class EagleProposer: def prepare_inputs( self, common_attn_metadata: CommonAttentionMetadata, - # [batch_size] - num_rejected_tokens: torch.Tensor + sampled_token_ids: list[list[int]], + num_draft_tokens: list[int], ) -> tuple[CommonAttentionMetadata, torch.Tensor]: """ - This function is used to prepare the inputs for the spec decode. 
+ This function is used to prepare the inputs for speculative decoding. It updates to the common_attn_metadata to account for the rejected tokens (and newly sampled tokens). It also returns the token indices of the tokens that should be fed to the speculator. @@ -545,6 +726,13 @@ class EagleProposer: # q1, q1 + 1, ..., q1 + q2 - n2 - 1, # q1 + q2, q1 + q2 + 1, ..., q1 + q2 + q3 - n3 - 1] + num_rejected_tokens = [ + n + 1 - len(sampled_token_ids[i]) if n > 0 else 0 + for i, n in enumerate(num_draft_tokens) + ] + num_rejected_tokens = torch.tensor(num_rejected_tokens, + dtype=torch.int32) + device = common_attn_metadata.query_start_loc.device query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu new_seq_lens_cpu = common_attn_metadata.seq_lens_cpu \ diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 339b9937b73f4..6717622efb801 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -64,7 +64,10 @@ class CachedRequestState: def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: return self.prompt_token_ids[idx] - return self.output_token_ids[idx - self.num_prompt_tokens] + elif idx - self.num_prompt_tokens < len(self.output_token_ids): + return self.output_token_ids[idx - self.num_prompt_tokens] + else: + return -1 class InputBatch: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f256dc160a6b5..e8ad9c2fca07c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -344,6 +344,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.hidden_size, dtype=self.dtype, numpy=False) + self.discard_request_indices = self._make_buffer(self.max_num_reqs, + dtype=torch.int64) + self.num_discarded_requests = 0 + self.num_draft_tokens = self._make_buffer(self.max_num_reqs, dtype=torch.int32) self.num_accepted_tokens = self._make_buffer(self.max_num_reqs, @@ -974,6 +978,21 @@ 
class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): seq_lens = self.seq_lens.gpu[:num_reqs] max_seq_len = self.seq_lens.np[:num_reqs].max().item() + num_tokens = [ + self.requests[r].num_tokens for r in self.input_batch.req_ids + ] + num_tokens_np = np.array(num_tokens, dtype=np.int32) + + # Record the index of requests that should not be sampled, + # so that we could clear the sampled tokens before returning + discard_requests_mask = self.seq_lens.np[:num_reqs] < num_tokens_np + discard_request_indices = np.nonzero(discard_requests_mask)[0] + self.num_discarded_requests = len(discard_request_indices) + self.discard_request_indices.np[:self.num_discarded_requests] = ( + discard_request_indices) + + self.discard_request_indices.copy_to_gpu(self.num_discarded_requests) + # Copy the tensors to the GPU. self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens) @@ -1973,23 +1992,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if envs.VLLM_COMPUTE_NANS_IN_LOGITS: num_nans_in_logits = self._get_nans_in_logits(logits) - # TODO(woosuk): The following loop can be slow since it iterates over - # the requests one by one. Optimize. - discard_sampled_tokens_req_indices = [] - for i, req_id in enumerate(self.input_batch.req_ids): - req_state = self.requests[req_id] - seq_len = (req_state.num_computed_tokens + - scheduler_output.num_scheduled_tokens[req_id]) - if seq_len < req_state.num_tokens: - # Ignore the sampled token for partial prefills. - # Rewind the generator state as if the token was not sampled. - # This relies on cuda-specific torch-internal impl details - generator = self.input_batch.generators.get(i) - if generator is not None: - generator.set_offset(generator.get_offset() - 4) - # Record the index of the request that should not be sampled, - # so that we could clear the sampled tokens before returning. 
- discard_sampled_tokens_req_indices.append(i) + discard_sampled_tokens_req_indices = \ + self.discard_request_indices.np[:self.num_discarded_requests] + for i in discard_sampled_tokens_req_indices: + gen = self.input_batch.generators.get(int(i)) + if gen is not None: + gen.set_offset(gen.get_offset() - 4) # Copy some objects so they don't get modified after returning. # This is important when using async scheduling. @@ -2026,10 +2034,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) # Mask out the sampled tokens that should not be sampled. for i in discard_sampled_tokens_req_indices: - valid_sampled_token_ids[i].clear() + valid_sampled_token_ids[int(i)].clear() else: valid_sampled_token_ids = [] - invalid_req_indices = list(discard_sampled_tokens_req_indices) + invalid_req_indices = discard_sampled_tokens_req_indices.tolist() invalid_req_indices_set = set(invalid_req_indices) assert sampled_token_ids.shape[-1] == 1 @@ -2229,6 +2237,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) + def propose_draft_token_ids(sampled_token_ids): + assert spec_decode_common_attn_metadata is not None + with record_function_or_nullcontext("Draft"): + self._draft_token_ids = self.propose_draft_token_ids( + scheduler_output, + sampled_token_ids, + self.input_batch.sampling_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + spec_decode_metadata, + spec_decode_common_attn_metadata, + ) + + use_padded_batch_for_eagle = self.speculative_config and \ + self.speculative_config.use_eagle() and \ + not self.speculative_config.disable_padded_drafter_batch + if use_padded_batch_for_eagle: + # EAGLE speculative decoding can use the GPU sampled tokens + # as inputs, and does not need to wait for bookkeeping to finish. 
+ propose_draft_token_ids(sampler_output.sampled_token_ids) + with record_function_or_nullcontext("Bookkeep"): ( num_nans_in_logits, @@ -2242,19 +2272,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits, hidden_states, num_scheduled_tokens) - if self.speculative_config: - assert spec_decode_common_attn_metadata is not None - with record_function_or_nullcontext("Draft"): - self._draft_token_ids = self.propose_draft_token_ids( - scheduler_output, - valid_sampled_token_ids, - self.input_batch.sampling_metadata, - hidden_states, - sample_hidden_states, - aux_hidden_states, - spec_decode_metadata, - spec_decode_common_attn_metadata, - ) + if self.speculative_config and not use_padded_batch_for_eagle: + # ngram and other speculative decoding methods use the sampled + # tokens on the CPU, so they are run after bookkeeping. + propose_draft_token_ids(valid_sampled_token_ids) with record_function_or_nullcontext("EPLB"): self.eplb_step() @@ -2294,7 +2315,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def propose_draft_token_ids( self, scheduler_output: "SchedulerOutput", - sampled_token_ids: list[list[int]], + sampled_token_ids: Union[torch.Tensor, list[list[int]]], sampling_metadata: SamplingMetadata, hidden_states: torch.Tensor, sample_hidden_states: torch.Tensor, @@ -2304,11 +2325,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) -> Union[list[list[int]], torch.Tensor]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if self.speculative_config.method == "ngram": + assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, NgramProposer) draft_token_ids = self.propose_ngram_draft_token_ids( sampled_token_ids) elif self.speculative_config.method == "medusa": + assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) + if sample_hidden_states.shape[0] == len(sampled_token_ids): # The input to the target model does 
not include draft tokens. hidden_states = sample_hidden_states @@ -2329,27 +2353,37 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) elif self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - # TODO(woosuk): Refactor the loop. - req_ids = self.input_batch.req_ids - next_token_ids: list[int] = [] - for i, token_ids in enumerate(sampled_token_ids): - if token_ids: - # Common case. - next_token_id = token_ids[-1] - else: - # Partial prefill (rare case). - # Get the next token id from the request state. - req_id = req_ids[i] - req_state = self.requests[req_id] - seq_len = (req_state.num_computed_tokens + - scheduler_output.num_scheduled_tokens[req_id]) - next_token_id = req_state.get_token_id(seq_len) - next_token_ids.append(next_token_id) - next_token_ids = torch.tensor(next_token_ids, - dtype=torch.int32, - device=self.device) + + if self.speculative_config.disable_padded_drafter_batch: + # When padded-batch is disabled, the sampled_token_ids should be + # the cpu-side list[list[int]] of valid sampled tokens for each + # request, with invalid requests having empty lists. + assert isinstance(sampled_token_ids, list), \ + "sampled_token_ids should be a python list when" \ + "padded-batch is disabled." + next_token_ids = self.drafter.prepare_next_token_ids_cpu( + sampled_token_ids, self.requests, self.input_batch, + scheduler_output.num_scheduled_tokens) + else: + # When using padded-batch, the sampled_token_ids should be + # the gpu tensor of sampled tokens for each request, of shape + # (num_reqs, num_spec_tokens + 1) with rejected tokens having + # value -1. + assert isinstance(sampled_token_ids, torch.Tensor), \ + "sampled_token_ids should be a torch.Tensor when" \ + "padded-batch is enabled." 
+ next_token_ids, valid_sampled_tokens_count = \ + self.drafter.prepare_next_token_ids_padded( + common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_indices.gpu, + self.num_discarded_requests + ) if spec_decode_metadata is None: + token_indices_to_sample = None # input_ids can be None for multimodal models. target_token_ids = self.input_ids.gpu[:num_scheduled_tokens] # TODO(woosuk): Support M-RoPE. @@ -2361,17 +2395,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: target_hidden_states = hidden_states[:num_scheduled_tokens] else: - # TODO(woosuk): Refactor this. - num_draft_tokens = spec_decode_metadata.num_draft_tokens - num_rejected_tokens = [ - n + 1 - len(sampled_token_ids[i]) if n > 0 else 0 - for i, n in enumerate(num_draft_tokens) - ] - num_rejected_tokens_cpu = torch.tensor(num_rejected_tokens, - dtype=torch.int32) - common_attn_metadata, token_indices =\ - self.drafter.prepare_inputs( - common_attn_metadata, num_rejected_tokens_cpu) + if self.speculative_config.disable_padded_drafter_batch: + token_indices_to_sample = None + common_attn_metadata, token_indices =\ + self.drafter.prepare_inputs( + common_attn_metadata, + sampled_token_ids, + spec_decode_metadata.num_draft_tokens) + else: + common_attn_metadata, token_indices, \ + token_indices_to_sample =\ + self.drafter.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count) target_token_ids = self.input_ids.gpu[token_indices] # TODO(woosuk): Support M-RoPE. 
@@ -2391,6 +2428,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): target_positions=target_positions, target_hidden_states=target_hidden_states, next_token_ids=next_token_ids, + last_token_indices=token_indices_to_sample, sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, mm_embeds=mm_embeds, From a904ea78eaf7fc3f9b136a1ba6f6f66fb5658496 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 17 Sep 2025 22:30:02 -0700 Subject: [PATCH 12/58] [benchmark] add peak throughput metrics and plot (#23867) Signed-off-by: simon-mo --- vllm/benchmarks/lib/endpoint_request_func.py | 5 + vllm/benchmarks/serve.py | 198 ++++++++++++------- 2 files changed, 134 insertions(+), 69 deletions(-) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index e640630476630..066b8fe834380 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -89,6 +89,7 @@ class RequestFuncOutput: tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" + start_time: float = 0.0 async def async_request_openai_completions( @@ -140,6 +141,7 @@ async def async_request_openai_completions( generated_text = "" st = time.perf_counter() + output.start_time = st most_recent_timestamp = st try: async with session.post(url=api_url, json=payload, @@ -272,6 +274,7 @@ async def async_request_openai_chat_completions( generated_text = "" ttft = 0.0 st = time.perf_counter() + output.start_time = st most_recent_timestamp = st try: async with session.post(url=api_url, json=payload, @@ -396,6 +399,7 @@ async def async_request_openai_audio( generated_text = "" ttft = 0.0 st = time.perf_counter() + output.start_time = st most_recent_timestamp = st try: async with session.post(url=api_url, @@ -475,6 +479,7 @@ async def async_request_openai_embeddings( output = RequestFuncOutput() st = time.perf_counter() + output.start_time = st try: async with 
session.post( url=api_url, diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 1aeef0fd5bd85..d8784340eba15 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -18,9 +18,11 @@ On the client side, run: import argparse import asyncio import gc +import importlib.util import json import os import random +import shutil import time import warnings from collections.abc import AsyncGenerator, Iterable @@ -46,6 +48,9 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MILLISECONDS_TO_SECONDS_CONVERSION = 1000 +TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None) + and (shutil.which("gnuplot") is not None)) + class TaskType(Enum): GENERATION = "generation" @@ -80,18 +85,23 @@ class BenchmarkMetrics: median_e2el_ms: float std_e2el_ms: float percentiles_e2el_ms: list[tuple[float, float]] + # Max output tokens per second and concurrent requests at that peak + max_output_tokens_per_s: float + max_concurrent_requests: int + @dataclass class EmbedBenchmarkMetrics: completed: int total_input: int request_throughput: float - total_token_throughput :float + total_token_throughput: float mean_e2el_ms: float std_e2el_ms: float median_e2el_ms: float percentiles_e2el_ms: float + def _get_current_request_rate( ramp_up_strategy: Optional[Literal["linear", "exponential"]], ramp_up_start_rps: Optional[int], @@ -150,8 +160,8 @@ async def get_request( assert burstiness > 0, ( f"A positive burstiness factor is expected, but given {burstiness}.") # Convert to list to get length for ramp-up calculations - if isinstance(input_requests, Iterable) and not isinstance( - input_requests, list): + if isinstance(input_requests, + Iterable) and not isinstance(input_requests, list): input_requests = list(input_requests) total_requests = len(input_requests) @@ -161,12 +171,9 @@ async def get_request( request_rates = [] delay_ts = [] for request_index, request in enumerate(input_requests): - current_request_rate = 
_get_current_request_rate(ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate) + current_request_rate = _get_current_request_rate( + ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps, + request_index, total_requests, request_rate) request_rates.append(current_request_rate) if current_request_rate == float("inf"): delay_ts.append(0) @@ -206,10 +213,8 @@ async def get_request( def calculate_metrics_for_embeddings( - outputs: list[RequestFuncOutput], - dur_s: float, - selected_percentiles: list[float] -) -> EmbedBenchmarkMetrics: + outputs: list[RequestFuncOutput], dur_s: float, + selected_percentiles: list[float]) -> EmbedBenchmarkMetrics: """Calculate the metrics for the embedding requests. Args: @@ -242,10 +247,8 @@ def calculate_metrics_for_embeddings( mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[ - (p, np.percentile(e2els or 0, p) * 1000) - for p in selected_percentiles - ], + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], ) return metrics @@ -336,6 +339,67 @@ def calculate_metrics( "All requests failed. 
This is likely due to a misconfiguration " "on the benchmark arguments.", stacklevel=2) + + # Calculate max output tokens per second metric + max_output_tokens_per_s = 0.0 + max_concurrent_requests = 0 + + # Find the time range across all successful requests + successful_outputs = [output for output in outputs if output.success] + if successful_outputs: + min_start_time = min(output.start_time + for output in successful_outputs) + max_end_time = max(output.start_time + output.latency + for output in successful_outputs) + + # Create second buckets (ceiling to ensure we capture all time) + duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1 + tokens_per_second = np.zeros(duration_seconds) + concurrent_requests_per_second = np.zeros(duration_seconds) + + for i, output in enumerate(successful_outputs): + # Calculate token generation timestamp using + # start_time, ttft, and itl + token_times = [output.start_time + output.ttft] + current_time = token_times[0] + for itl_value in output.itl: + current_time += itl_value + token_times.append(current_time) + + # Add tokens to second buckets + for token_time in token_times: + second_bucket = int(token_time - min_start_time) + if 0 <= second_bucket < duration_seconds: + tokens_per_second[second_bucket] += 1 + + # Track concurrent requests for each second this request was active + request_start_second = int(output.start_time - min_start_time) + request_end_second = int((output.start_time + output.latency) - + min_start_time) + + for second in range(request_start_second, request_end_second + 1): + concurrent_requests_per_second[second] += 1 + + # Find the maximum tokens per second and corresponding + # concurrent requests + if len(tokens_per_second) > 0: + max_output_tokens_per_s = float(np.max(tokens_per_second)) + max_concurrent_requests = int( + np.max(concurrent_requests_per_second)) + + if TERM_PLOTLIB_AVAILABLE: + import termplotlib as tpl + fig = tpl.figure() + fig.plot(np.arange(len(tokens_per_second)), + 
tokens_per_second, + title="Output tokens per second") + fig.plot(np.arange(len(concurrent_requests_per_second)), + concurrent_requests_per_second, + title="Concurrent requests per second") + fig.show() + else: + print("tip: install termplotlib and gnuplot to plot the metrics") + metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -365,6 +429,8 @@ def calculate_metrics( median_e2el_ms=np.median(e2els or 0) * 1000, percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], + max_output_tokens_per_s=max_output_tokens_per_s, + max_concurrent_requests=max_concurrent_requests, ) return metrics, actual_output_lens @@ -396,11 +462,8 @@ async def benchmark( ramp_up_end_rps: Optional[int] = None, ready_check_timeout_sec: int = 600, ): - task_type = ( - TaskType.EMBEDDING - if api_url.endswith("/v1/embeddings") - else TaskType.GENERATION - ) + task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else + TaskType.GENERATION) if endpoint_type in ASYNC_REQUEST_FUNCS: if task_type == TaskType.EMBEDDING: request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"] @@ -435,14 +498,10 @@ async def benchmark( input_requests[0].multi_modal_data, ) - assert ( - test_mm_content is None - or isinstance(test_mm_content, dict) - or ( - isinstance(test_mm_content, list) - and all(isinstance(item, dict) for item in test_mm_content) - ) - ), "multi_modal_data must be a dict or list[dict]" + assert (test_mm_content is None or isinstance(test_mm_content, dict) + or (isinstance(test_mm_content, list) + and all(isinstance(item, dict) for item in test_mm_content)) + ), "multi_modal_data must be a dict or list[dict]" test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -488,13 +547,13 @@ async def benchmark( ignore_eos=ignore_eos, extra_headers=extra_headers, extra_body=extra_body) - profile_output = await request_func( - request_func_input=profile_input, session=session) + profile_output = await 
request_func(request_func_input=profile_input, + session=session) if profile_output.success: print("Profiler started") - distribution = ("Poisson process" if burstiness == 1.0 - else "Gamma distribution") + distribution = ("Poisson process" + if burstiness == 1.0 else "Gamma distribution") if ramp_up_strategy is not None: print(f"Traffic ramp-up strategy: {ramp_up_strategy}.") @@ -562,18 +621,20 @@ async def benchmark( req_lora_module = next(lora_modules) req_model_id, req_model_name = req_lora_module, req_lora_module - request_func_input = RequestFuncInput(model=req_model_id, - model_name=req_model_name, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - multi_modal_content=mm_content, - ignore_eos=ignore_eos, - extra_headers=extra_headers, - extra_body=extra_body, - request_id=request_id,) + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + multi_modal_content=mm_content, + ignore_eos=ignore_eos, + extra_headers=extra_headers, + extra_body=extra_body, + request_id=request_id, + ) tasks.append( asyncio.create_task( limited_request_func(request_func_input=request_func_input, @@ -615,19 +676,21 @@ async def benchmark( benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) if isinstance(metrics, BenchmarkMetrics): - print("{:<40} {:<10}".format( - "Total generated tokens:", metrics.total_output)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) if isinstance(metrics, BenchmarkMetrics): - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput - ) - ) 
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format( + "Peak output token throughput (tok/s):", + metrics.max_output_tokens_per_s)) + print("{:<40} {:<10.2f}".format("Peak concurrent requests:", + metrics.max_concurrent_requests)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) @@ -648,6 +711,8 @@ async def benchmark( "itls": [output.itl for output in outputs], "generated_texts": [output.generated_text for output in outputs], "errors": [output.error for output in outputs], + "max_output_tokens_per_s": metrics.max_output_tokens_per_s, + "max_concurrent_requests": metrics.max_concurrent_requests, } else: result = { @@ -697,8 +762,8 @@ async def benchmark( if task_type == TaskType.GENERATION: process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric( - "tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -714,8 +779,8 @@ async def benchmark( output_len=test_output_len, logprobs=logprobs, ) - profile_output = await request_func( - request_func_input=profile_input, session=session) + profile_output = await request_func(request_func_input=profile_input, + session=session) if profile_output.success: print("Profiler stopped") @@ -851,7 +916,8 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--tokenizer", type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -982,7 +1048,6 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Specify the prefix of request id.", ) - sampling_group = parser.add_argument_group("sampling parameters") sampling_group.add_argument( "--top-p", @@ -1047,8 +1112,7 @@ def add_cli_args(parser: argparse.ArgumentParser): help="The ramp-up strategy. This would be used to " "ramp up the request rate from initial RPS to final " "RPS rate (specified by --ramp-up-start-rps and " - "--ramp-up-end-rps.) over the duration of the benchmark." - ) + "--ramp-up-end-rps.) over the duration of the benchmark.") parser.add_argument( "--ramp-up-start-rps", type=int, @@ -1087,13 +1151,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError( "When using ramp-up, do not specify --request-rate. " "The request rate will be controlled by ramp-up parameters. " - "Please remove the --request-rate argument." 
- ) + "Please remove the --request-rate argument.") if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None: raise ValueError( "When using --ramp-up-strategy, both --ramp-up-start-rps and " - "--ramp-up-end-rps must be specified" - ) + "--ramp-up-end-rps must be specified") if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0: raise ValueError("Ramp-up start and end RPS must be non-negative") if args.ramp_up_start_rps > args.ramp_up_end_rps: @@ -1127,8 +1189,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: headers[kvstring[0].strip()] = kvstring[1].strip() else: raise ValueError( - "Invalid header format. Please use KEY=VALUE format." - ) + "Invalid header format. Please use KEY=VALUE format.") tokenizer = get_tokenizer(tokenizer_id, tokenizer_mode=tokenizer_mode, @@ -1215,8 +1276,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: result_json[kvstring[0].strip()] = kvstring[1].strip() else: raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." - ) + "Invalid metadata format. 
Please use KEY=VALUE format.") # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate From e111d5b0ae9359e2a829771105e739d36505fa69 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 17 Sep 2025 22:30:26 -0700 Subject: [PATCH 13/58] [CLI] Use streaming in CLI chat and completion commands (#23769) Signed-off-by: simon-mo --- vllm/entrypoints/cli/openai.py | 71 +++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index 7c01de94a3436..1929d6a7f77af 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -45,6 +45,28 @@ def _interactive_cli(args: argparse.Namespace) -> tuple[str, OpenAI]: return model_name, openai_client +def _print_chat_stream(stream) -> str: + output = "" + for chunk in stream: + delta = chunk.choices[0].delta + if delta.content: + output += delta.content + print(delta.content, end="", flush=True) + print() + return output + + +def _print_completion_stream(stream) -> str: + output = "" + for chunk in stream: + text = chunk.choices[0].text + if text is not None: + output += text + print(text, end="", flush=True) + print() + return output + + def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None: conversation: list[ChatCompletionMessageParam] = [] if system_prompt is not None: @@ -58,14 +80,11 @@ def chat(system_prompt: str | None, model_name: str, client: OpenAI) -> None: break conversation.append({"role": "user", "content": input_message}) - chat_completion = client.chat.completions.create(model=model_name, - messages=conversation) - - response_message = chat_completion.choices[0].message - output = response_message.content - - conversation.append(response_message) # type: ignore - print(output) + stream = client.chat.completions.create(model=model_name, + messages=conversation, + stream=True) + output = _print_chat_stream(stream) + conversation.append({"role": 
"assistant", "content": output}) def _add_query_options( @@ -108,9 +127,11 @@ class ChatCommand(CLISubcommand): if args.quick: conversation.append({"role": "user", "content": args.quick}) - chat_completion = client.chat.completions.create( - model=model_name, messages=conversation) - print(chat_completion.choices[0].message.content) + stream = client.chat.completions.create(model=model_name, + messages=conversation, + stream=True) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) return print("Please enter a message for the chat model:") @@ -121,14 +142,11 @@ class ChatCommand(CLISubcommand): break conversation.append({"role": "user", "content": input_message}) - chat_completion = client.chat.completions.create( - model=model_name, messages=conversation) - - response_message = chat_completion.choices[0].message - output = response_message.content - - conversation.append(response_message) # type: ignore - print(output) + stream = client.chat.completions.create(model=model_name, + messages=conversation, + stream=True) + output = _print_chat_stream(stream) + conversation.append({"role": "assistant", "content": output}) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: @@ -168,9 +186,10 @@ class CompleteCommand(CLISubcommand): model_name, client = _interactive_cli(args) if args.quick: - completion = client.completions.create(model=model_name, - prompt=args.quick) - print(completion.choices[0].text) + stream = client.completions.create(model=model_name, + prompt=args.quick, + stream=True) + _print_completion_stream(stream) return print("Please enter prompt to complete:") @@ -179,10 +198,10 @@ class CompleteCommand(CLISubcommand): input_prompt = input("> ") except EOFError: break - completion = client.completions.create(model=model_name, - prompt=input_prompt) - output = completion.choices[0].text - print(output) + stream = client.completions.create(model=model_name, + 
prompt=input_prompt, + stream=True) + _print_completion_stream(stream) @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: From 81b16a2bc922e837267db7216a274c4d89a2cc0c Mon Sep 17 00:00:00 2001 From: Lumina Date: Thu, 18 Sep 2025 13:53:55 +0800 Subject: [PATCH 14/58] [Kernel] Better inf handling for grouped topk cu (#24886) Signed-off-by: lumina37 --- csrc/moe/grouped_topk_kernels.cu | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index accbb09858fac..b5321f748e6be 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include namespace cg = cooperative_groups; @@ -28,7 +29,6 @@ namespace cg = cooperative_groups; namespace vllm { namespace moe { -constexpr float kNegInfinity = INFINITY * -1; constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr int32_t WARP_SIZE = 32; constexpr int32_t BLOCK_SIZE = 512; @@ -411,14 +411,21 @@ __device__ inline float cuda_cast(__nv_bfloat16 val) { return __bfloat162float(val); } +template +__device__ inline T neg_inf() { + // cuda::std::numeric_limits::infinity() returns `0` for [T=bf16 or fp16] + // so we need to cast from fp32 + return cuda_cast(-cuda::std::numeric_limits::infinity()); +} + template __device__ void topk_with_k2(T* output, T const* input, cg::thread_block_tile<32> const& tile, int32_t const lane_id, int const num_experts_per_group) { // Get the top2 per thread - T largest = -INFINITY; - T second_largest = -INFINITY; + T largest = neg_inf(); + T second_largest = neg_inf(); if (num_experts_per_group > WARP_SIZE) { for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { @@ -513,8 +520,8 @@ __global__ void group_idx_and_topk_idx_kernel( warp_id * topk; s_topk_idx += warp_id * topk; - T value = kNegInfinity; - T topk_group_value = kNegInfinity; + T value = 
neg_inf(); + T topk_group_value = neg_inf(); int32_t num_equalto_topkth_group; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) @@ -525,11 +532,8 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { // calculate group_idx int32_t target_num_min = WARP_SIZE - n_group + topk_group; - if (lane_id < n_group && - (isfinite(cuda_cast( - group_scores[lane_id])))) // The check is necessary to avoid - // abnormal input - { + // The check is necessary to avoid abnormal input + if (lane_id < n_group && cuda::std::isfinite(group_scores[lane_id])) { value = group_scores[lane_id]; } @@ -540,11 +544,11 @@ __global__ void group_idx_and_topk_idx_kernel( __syncwarp(); // Ensure all threads have valid data before reduction topk_group_value = cg::reduce(tile, value, cg::greater()); if (value == topk_group_value) { - value = kNegInfinity; + value = neg_inf(); } pre_count_equal_to_top_value = count_equal_to_top_value; - count_equal_to_top_value = __popc(__ballot_sync( - FULL_WARP_MASK, (value == cuda_cast(kNegInfinity)))); + count_equal_to_top_value = + __popc(__ballot_sync(FULL_WARP_MASK, (value == neg_inf()))); } num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; } @@ -552,11 +556,10 @@ __global__ void group_idx_and_topk_idx_kernel( warp_topk::WarpSelect - queue((int32_t)topk, -INFINITY); + queue((int32_t)topk, neg_inf()); int count_equalto_topkth_group = 0; - bool if_proceed_next_topk = - (topk_group_value != cuda_cast(kNegInfinity)); + bool if_proceed_next_topk = topk_group_value != neg_inf(); if (case_id < num_tokens && if_proceed_next_topk) { for (int i_group = 0; i_group < n_group; i_group++) { if ((group_scores[i_group] > topk_group_value) || @@ -566,10 +569,10 @@ __global__ void group_idx_and_topk_idx_kernel( for (int32_t i = lane_id; i < align_num_experts_per_group; i += WARP_SIZE) { T candidates = - (i < num_experts_per_group) && isfinite(cuda_cast( - scores_with_bias[offset + i])) + (i < num_experts_per_group) && + 
cuda::std::isfinite(scores_with_bias[offset + i]) ? scores_with_bias[offset + i] - : cuda_cast(kNegInfinity); + : neg_inf(); queue.add(candidates, offset + i); } if (group_scores[i_group] == topk_group_value) { @@ -598,7 +601,8 @@ __global__ void group_idx_and_topk_idx_kernel( if (i < topk) { s_topk_value[i] = value; } - topk_sum += reduce(tile, cuda_cast(value), cg::plus()); + topk_sum += + cg::reduce(tile, cuda_cast(value), cg::plus()); } } From 349e0e34627950db1cc4be0df9a0bc616e210589 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 07:23:29 +0100 Subject: [PATCH 15/58] [Docs] Fix API Reference (#25140) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- mkdocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index bbd850bdfee34..6f2be65a18af8 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -79,7 +79,7 @@ plugins: - "re:vllm\\._.*" # Internal modules - "vllm.third_party" - "vllm.vllm_flash_attn" - - !ENV [API_AUTONAV_EXCLUDE, ""] + - !ENV [API_AUTONAV_EXCLUDE, "re:^$"] # Match nothing by default - mkdocstrings: handlers: python: From f4cd80f94404787859ba72dcddb5e818d8f0c9e7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 07:29:05 +0100 Subject: [PATCH 16/58] Retrieve `sliding_window` from text config in Gemma3 MM (#25085) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/gemma3_mm.py | 3 ++- vllm/model_executor/models/gemma3n_mm.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index e652ba2f1c7fe..bee9fbd2c084a 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -688,7 +688,8 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, 
global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) global_attn_masks.append(global_attn_mask) - if (sliding_window := self.config.sliding_window) is not None: + sliding_window = self.config.text_config.sliding_window + if sliding_window is not None: # Create a local causal mask with sliding window (1024). local_attn_mask = torch.ones_like(global_attn_mask) local_attn_mask = torch.tril(local_attn_mask, diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 663d4da7cec23..8d3079aee0dfb 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -461,9 +461,6 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, self.multimodal_config = multimodal_config self.vocab_size = config.text_config.vocab_size - self.sliding_window = getattr(config.text_config, - "interleaved_sliding_window", None) - self.vision_tower = AutoModel.from_config(config=config.vision_config) self.audio_tower = AutoModel.from_config(config=config.audio_config) self.embed_vision = Gemma3nMultimodalEmbedder(config.vision_config, From 350c94deb30747f84536ee34d91c6fca564667ce Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 18 Sep 2025 15:47:43 +0800 Subject: [PATCH 17/58] [Bugfix] when use s3 model cannot use default load_format (#24435) Signed-off-by: rongfu.leng Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/config/__init__.py | 12 ++++++++++++ vllm/engine/arg_utils.py | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 64be2f38c6a31..631618d427d42 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3029,6 +3029,18 @@ class VllmConfig: SequenceClassificationConfig) SequenceClassificationConfig.verify_and_update_config(self) + if hasattr(self.model_config, "model_weights") and is_runai_obj_uri( + self.model_config.model_weights): + if 
self.load_config.load_format == "auto": + logger.info("Detected Run:ai model config. " + "Overriding `load_format` to 'runai_streamer'") + self.load_config.load_format = "runai_streamer" + elif self.load_config.load_format != "runai_streamer": + raise ValueError(f"To load a model from S3, 'load_format' " + f"must be 'runai_streamer', " + f"but got '{self.load_config.load_format}'. " + f"Model: {self.model_config.model}") + def __str__(self): return ( f"model={self.model_config.model!r}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4831cb5348c77..e2a1ec68e6f53 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -959,7 +959,6 @@ class EngineArgs: if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3 and self.model in MODELS_ON_S3 and self.load_format == "auto"): self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}" - self.load_format = "runai_streamer" if self.disable_mm_preprocessor_cache: logger.warning( From ef7eefe17a7dc212ddb8a8aabd7760218a10e25e Mon Sep 17 00:00:00 2001 From: Tao He Date: Thu, 18 Sep 2025 16:16:04 +0800 Subject: [PATCH 18/58] [Qwen] Add fp8 checkpoint support for qwen3-next. 
(#25079) Signed-off-by: Tao He --- vllm/model_executor/models/qwen3_next.py | 35 ++++++++++---------- vllm/model_executor/models/qwen3_next_mtp.py | 8 +++-- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ca9f4d402dac2..eb060cb90f44c 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -30,7 +30,6 @@ from vllm.model_executor.layers.layernorm import ( GemmaRMSNorm as Qwen3NextRMSNorm) # yapf: enable from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) @@ -254,12 +253,20 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): # projection of the input hidden states self.projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2 self.projection_size_ba = self.num_v_heads * 2 - self.in_proj = MergedColumnParallelLinear( + self.in_proj_qkvz = ColumnParallelLinear( input_size=self.hidden_size, - output_sizes=[self.projection_size_qkvz, self.projection_size_ba], + output_size=self.projection_size_qkvz, bias=False, quant_config=quant_config, - prefix=f"{prefix}.in_proj", + prefix=f"{prefix}.in_proj_qkvz", + ) + # ba_proj doesn't support blockwise fp8 quantization. + self.in_proj_ba = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.projection_size_ba, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_ba", ) query_key_settings = (self.key_dim, 0, False) @@ -420,19 +427,14 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): ssm_state = self_kv_cache[1] num_actual_tokens = attn_metadata.num_actual_tokens num_accepted_tokens = attn_metadata.num_accepted_tokens - - # 1. 
Set up dimensions for reshapes later - projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens]) if spec_token_masks is not None: spec_token_masks = spec_token_masks[:num_actual_tokens] - projected_states_qkvz, projected_states_ba = torch.split( - projected_states, - [ - self.projection_size_qkvz // self.tp_size, - self.projection_size_ba // self.tp_size - ], - dim=-1, - ) + + # 1. Set up dimensions for reshapes later + projected_states_qkvz, _ = self.in_proj_qkvz( + hidden_states[:num_actual_tokens]) + projected_states_ba, _ = self.in_proj_ba( + hidden_states[:num_actual_tokens]) query, key, value, z, b, a = self.fix_query_key_value_ordering( projected_states_qkvz, projected_states_ba) query, key, value = map(lambda x: rearrange(x, 'l p d -> l (p d)'), @@ -976,8 +978,6 @@ class Qwen3NextModel(nn.Module): ("qkv_proj", "v_proj", "v"), ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), - ("in_proj", "in_proj_qkvz", 0), - ("in_proj", "in_proj_ba", 1), ] params_dict = dict(self.named_parameters()) @@ -1055,7 +1055,6 @@ class Qwen3NextForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, "v_proj", ], "gate_up_proj": ["gate_proj", "up_proj"], - "in_proj": ["in_proj_qkvz", "in_proj_ba"], } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py index 190a1750e673a..c755eeb9b4eaa 100644 --- a/vllm/model_executor/models/qwen3_next_mtp.py +++ b/vllm/model_executor/models/qwen3_next_mtp.py @@ -63,7 +63,9 @@ class Qwen3NextMultiTokenPredictor(nn.Module): self.config.hidden_size, gather_output=True, bias=False, - return_bias=False) + return_bias=False, + quant_config=quant_config, + prefix=f'{prefix}.fc') self.layers = torch.nn.ModuleList( Qwen3NextDecoderLayer( @@ -72,7 +74,7 @@ class Qwen3NextMultiTokenPredictor(nn.Module): model_config=model_config, cache_config=cache_config, quant_config=quant_config, - 
prefix=f'{prefix}.layers.{self.mtp_start_layer_idx + idx}', + prefix=f'{prefix}.layers.{idx}', ) for idx in range(self.num_mtp_layers)) self.make_empty_intermediate_tensors = ( @@ -233,7 +235,7 @@ class Qwen3NextMTP(nn.Module, SupportsPP): self.config = config self.model = Qwen3NextMultiTokenPredictor(vllm_config=vllm_config, prefix=maybe_prefix( - prefix, "model")) + prefix, "mtp")) self.unpadded_vocab_size = config.vocab_size self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, From aa3f105c591a506523804e12800adcca80480bd8 Mon Sep 17 00:00:00 2001 From: Gerard Finol Date: Thu, 18 Sep 2025 11:02:14 +0200 Subject: [PATCH 19/58] Add 'path' option to ImagePrompt data_format (#25081) Signed-off-by: Gerard Finol --- .../prithvi_io_processor_plugin/prithvi_io_processor/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py index d480aef704c61..d4c6628211fb2 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py @@ -22,7 +22,7 @@ class DataModuleConfig(TypedDict): class ImagePrompt(BaseModel): - data_format: Literal["b64_json", "bytes", "url"] + data_format: Literal["b64_json", "bytes", "url", "path"] """ This is the data type for the input image """ From 05b044e698bb3c151871d94b64fabd87188de9ef Mon Sep 17 00:00:00 2001 From: Punitvara Date: Thu, 18 Sep 2025 14:35:16 +0530 Subject: [PATCH 20/58] [Doc] Fix cross-reference warnings (#25058) Signed-off-by: Punit Vara Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/benchmarks/datasets.py | 3 ++- .../device_communicators/shm_object_storage.py | 8 ++++---- .../model_executor/layers/mamba/ops/causal_conv1d.py | 12 +++++++----- 
vllm/model_executor/models/mistral3.py | 2 +- vllm/multimodal/profiling.py | 2 +- vllm/v1/core/kv_cache_manager.py | 5 +++-- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1831539a6adbe..1cab40802c392 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -171,7 +171,8 @@ class BenchmarkDataset(ABC): If `None`, LoRA is not used. Returns: - A new [LoRARequest][] (or `None` if not applicable). + A new [`LoRARequest`][vllm.lora.request.LoRARequest] + (or `None` if not applicable). """ if max_loras is None or lora_path is None: return None diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py index 3fac104bda1e8..352e7525d4c84 100644 --- a/vllm/distributed/device_communicators/shm_object_storage.py +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -30,7 +30,7 @@ class SingleWriterShmRingBuffer: - Maintains metadata for each allocated buffer chunk in the writer process - Supports custom "is_free_fn" functions to determine when buffers can be reused - - Each buffer chunk contains: [4-byte id][4-byte size][actual_data] + - Each buffer chunk contains: `[4-byte id][4-byte size][actual_data]` Key Concepts: - monotonic_id_start/end: Track the range of active buffer IDs @@ -99,7 +99,7 @@ class SingleWriterShmRingBuffer: - Writer handles garbage collection (free_buf) based on reader feedback Memory Layout per Buffer Chunk: - [4-byte monotonic_id][4-byte chunk_size][actual_data...] + `[4-byte monotonic_id][4-byte chunk_size][actual_data...]` ^metadata_start ^data_start The monotonic_id ensures data integrity - readers can verify they're @@ -185,7 +185,7 @@ class SingleWriterShmRingBuffer: ''' Allocate a buffer `MD_SIZE` + `size` bytes in the shared memory. Memory layout: - [4-byte monotonic_id][4-byte size][buffer data...] 
+ `[4-byte monotonic_id][4-byte size][buffer data...]` ''' assert self.is_writer, "Only the writer can allocate buffers." assert size > 0, "Size must be greater than 0" @@ -413,7 +413,7 @@ class SingleWriterShmObjectStorage: allocation Memory Layout per Object: - [4-byte reference_count][metadata_size][serialized_object_data] + `[4-byte reference_count][metadata_size][serialized_object_data]` Thread Safety: - Writer operations (put, clear) are single-threaded by design diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 7e3ea561fd293..2a88fa661da01 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -927,11 +927,13 @@ def causal_conv1d_update( validate_data=False, ): """ - x: (batch, dim) or (batch, dim, seqlen) or (num_tokens, dim) - [shape=2: single token prediction] - [shape=3: single or multiple tokens prediction] - [shape=2 with num_tokens: continuous batching, where num_tokens is the - total tokens of all sequences in that batch] + x: Input tensor which can take the following shapes: + + - `[batch, dim]` - single token prediction + - `[batch, dim, seqlen]` - single or multiple tokens prediction + - `[num_tokens, dim]` - continuous batching, where num_tokens is + the total tokens of all sequences in that batch + conv_state: (..., dim, state_len), where state_len >= width - 1 weight: (dim, width) bias: (dim,) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 09479012a03ad..d15776a39362d 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -583,7 +583,7 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, inputs_embeds: Optional tensor of input embeddings. 
Info: - [Mistral3ImagePixelInputs][] + [`Mistral3ImagePixelInputs`][vllm.model_executor.models.mistral3.Mistral3ImagePixelInputs] """ if intermediate_tensors is not None: inputs_embeds = None diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index bad6c0c3d9db2..fbbc55d3524ca 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -301,7 +301,7 @@ class MultiModalProfiler(Generic[_I]): Returns the maximum length of the multimodal (image placeholders+text) tokens, including any break/text tokens in-between image embeddings. - [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] + ` [IMG] [IMG] [IMG] [IMG] [IMG] [IMG] ` Returns 9, even when the number of image embeddings is 6. This is important to take into account when profiling and diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 3a0fbb5e5c41e..401327f727a4a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -24,8 +24,9 @@ class KVCacheBlocks: """ blocks: tuple[list[KVCacheBlock], ...] """ - blocks[i][j] refers to the i-th kv_cache_group and the j-th block of tokens. - We don't use block of tokens as the outer dimension because it assumes all + `blocks[i][j]` refers to the i-th kv_cache_group + and the j-th block of tokens.We don't use block of + tokens as the outer dimension because it assumes all kv_cache_groups have the same number of blocks, which is true for now but will be broken if we want to give different block_size to different kv_cache_groups in the future. 
From 29283e89762a3d572c504e5ea317351696b553a6 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Thu, 18 Sep 2025 05:20:27 -0400 Subject: [PATCH 21/58] [Chore] Cleanup guided namespace, move to structured outputs config (#22772) Signed-off-by: Aaron Pham Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../scripts/hardware_ci/run-amd-test.sh | 6 - .buildkite/test-pipeline.yaml | 3 +- .github/mergify.yml | 2 +- .../benchmark_serving_structured_output.py | 16 +- docs/api/README.md | 2 +- docs/features/reasoning_outputs.md | 10 +- docs/features/structured_outputs.md | 36 +-- docs/features/tool_calling.md | 11 +- docs/serving/openai_compatible_server.md | 4 +- .../offline_inference/structured_outputs.py | 54 ++--- ...t_completion_client_with_tools_required.py | 2 +- .../structured_outputs/structured_outputs.py | 8 +- tests/entrypoints/conftest.py | 2 +- tests/entrypoints/llm/test_lazy_outlines.py | 82 ------- tests/entrypoints/openai/test_chat.py | 123 +++++------ tests/entrypoints/openai/test_completion.py | 79 ++++--- .../test_completion_with_function_calling.py | 4 +- .../entrypoints/openai/test_openai_schema.py | 8 +- .../openai/test_prompt_validation.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 4 - .../openai/test_transcription_validation.py | 2 +- .../openai/test_translation_validation.py | 2 +- tests/test_sampling_params.py | 84 ------- tests/tool_use/test_tool_choice_required.py | 11 +- tests/v1/core/test_scheduler.py | 6 +- tests/v1/engine/test_llm_engine.py | 4 +- tests/v1/entrypoints/conftest.py | 2 +- .../llm/test_struct_output_generate.py | 135 ++++++------ .../openai/test_chat_completion.py | 14 +- .../v1/entrypoints/openai/test_completion.py | 14 +- vllm/config/__init__.py | 35 +-- vllm/engine/arg_utils.py | 95 ++++---- vllm/engine/async_llm_engine.py | 7 +- vllm/engine/llm_engine.py | 18 +- vllm/engine/protocol.py | 7 +- vllm/entrypoints/llm.py 
| 27 ++- vllm/entrypoints/openai/api_server.py | 10 +- vllm/entrypoints/openai/protocol.py | 206 ++++++------------ vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/model_executor/models/config.py | 6 +- vllm/sampling_params.py | 62 ++---- vllm/transformers_utils/tokenizers/mistral.py | 5 +- vllm/v1/engine/async_llm.py | 3 - vllm/v1/engine/processor.py | 57 +++-- vllm/v1/request.py | 2 +- vllm/v1/structured_output/__init__.py | 13 +- vllm/v1/structured_output/backend_guidance.py | 4 +- .../backend_lm_format_enforcer.py | 22 +- vllm/v1/structured_output/backend_outlines.py | 32 +-- vllm/v1/structured_output/backend_xgrammar.py | 38 ++-- vllm/v1/structured_output/request.py | 2 +- 51 files changed, 579 insertions(+), 806 deletions(-) delete mode 100644 tests/entrypoints/llm/test_lazy_outlines.py delete mode 100644 tests/test_sampling_params.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index c395011a24485..7f90181048d0f 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_prompt_validation.py "} fi -#Obsolete currently -##ignore certain Entrypoints/llm tests -#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then -# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} -#fi - # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8dd99bf1a38f6..66dfc990805f2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -108,8 +108,7 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm 
--ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests diff --git a/.github/mergify.yml b/.github/mergify.yml index f2dd2e06214ae..94198b1251e09 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -171,7 +171,7 @@ pull_request_rules: - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 4aae755eb4e44..73b4aa5a87e07 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -696,11 +696,11 @@ def evaluate(ret, args): return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == "guided_json": + if args.structure_type == "json": return _eval_correctness_json(expected, actual) - elif args.structure_type == "guided_regex": + elif args.structure_type == "regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == "guided_choice": + elif args.structure_type == "choice": return _eval_correctness_choice(expected, actual) else: return None @@ -780,18 +780,18 @@ def main(args: argparse.Namespace): ) if args.dataset == 
"grammar": - args.structure_type = "guided_grammar" + args.structure_type = "grammar" elif args.dataset == "regex": - args.structure_type = "guided_regex" + args.structure_type = "regex" elif args.dataset == "choice": - args.structure_type = "guided_choice" + args.structure_type = "choice" else: - args.structure_type = "guided_json" + args.structure_type = "json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = f"{args.structured_output_ratio}guided" + result_file_name = f"{args.structured_output_ratio}so" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" diff --git a/docs/api/README.md b/docs/api/README.md index 57142e8f5625d..148211756480c 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes. - [vllm.config.LoRAConfig][] - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] -- [vllm.config.DecodingConfig][] +- [vllm.config.StructuredOutputsConfig][] - [vllm.config.ObservabilityConfig][] - [vllm.config.KVTransferConfig][] - [vllm.config.CompilationConfig][] diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index d518e7f0cff43..85681669dfb22 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| -| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | -| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | 
`deepseek_r1` | `json`, `regex` | ❌ | +| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | -| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | -| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ | -| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ | +| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ | +| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | +| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ | !!! note IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 0d6294a5fdd79..1f955c6e30d6c 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla The following parameters are supported, which must be added as extra parameters: -- `guided_choice`: the output will be exactly one of the choices. -- `guided_regex`: the output will follow the regex pattern. -- `guided_json`: the output will follow the JSON schema. -- `guided_grammar`: the output will follow the context free grammar. +- `choice`: the output will be exactly one of the choices. 
+- `regex`: the output will follow the regex pattern. +- `json`: the output will follow the JSON schema. +- `grammar`: the output will follow the context free grammar. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page. Structured outputs are supported by default in the OpenAI-Compatible Server. You may choose to specify the backend to use by setting the -`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`, +`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`, which will try to choose an appropriate backend based on the details of the request. You may also choose a specific backend, along with some options. A full set of options is available in the `vllm serve --help` text. -Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: +Now let's see an example for each of the cases, starting with the `choice`, as it's the easiest one: ??? code @@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], - extra_body={"guided_choice": ["positive", "negative"]}, + extra_body={"structured_outputs": {"choice": ["positive", "negative"]}}, ) print(completion.choices[0].message.content) ``` -The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: ??? code @@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. 
Example result: alan.turing@enigma.com\n", } ], - extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, + extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]}, ) print(completion.choices[0].message.content) ``` One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the `guided_json` parameter in two different ways: +For this we can use the `json` parameter in two different ways: - Using directly a [JSON Schema](https://json-schema.org/) - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). -The next example shows how to use the `guided_json` parameter with a Pydantic model: +The next example shows how to use the `response_format` parameter with a Pydantic model: ??? code @@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo JSON schema and how the fields should be populated. This can improve the results notably in most cases. -Finally we have the `guided_grammar` option, which is probably the most +Finally we have the `grammar` option, which is probably the most difficult to use, but it´s really powerful. It allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar. 
As an example, we can use to define a specific format of simplified SQL queries: @@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries: "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", } ], - extra_body={"guided_grammar": simplified_sql_grammar}, + extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}}, ) print(completion.choices[0].message.content) ``` @@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: str: @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, speculative_config", + "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) def test_structured_output( monkeypatch: pytest.MonkeyPatch, @@ -99,8 +100,8 @@ def test_structured_output( sample_sql_ebnf: str, sample_sql_lark: str, sample_regex: str, - sample_guided_choice: str, - guided_decoding_backend: str, + sample_structured_outputs_choices: str, + backend: str, tokenizer_mode: str, model_name: str, speculative_config: dict[str, Any], @@ -115,16 +116,15 @@ def test_structured_output( enforce_eager = bool(not current_platform.is_tpu()) # Use a single LLM instance for several scenarios to # speed up the test suite. 
- llm = LLM( - model=model_name, - enforce_eager=enforce_eager, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=(guided_decoding_backend - in {"xgrammar", "guidance"}), - seed=120, - tokenizer_mode=tokenizer_mode, - speculative_config=speculative_config) + llm = LLM(model=model_name, + enforce_eager=enforce_eager, + max_model_len=1024, + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + in {"xgrammar", "guidance"}), + seed=120, + tokenizer_mode=tokenizer_mode, + speculative_config=speculative_config) # # Test 1: Generate JSON output based on a provided schema @@ -132,7 +132,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)) + structured_outputs=StructuredOutputsParams(json=sample_json_schema)) prompt = ("Give an example JSON for an employee profile that fits this " "schema. Make the response as short as possible. 
Schema: " @@ -152,7 +152,7 @@ def test_structured_output( generated_text = output.outputs[0].text assert generated_text is not None - if guided_decoding_backend != 'lm-format-enforcer': + if backend != 'lm-format-enforcer': assert "\n" not in generated_text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) @@ -161,12 +161,12 @@ def test_structured_output( # # Test 2: Generate JSON object without a schema # - if guided_decoding_backend != "outlines": + if backend != "outlines": sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, n=2, - guided_decoding=GuidedDecodingParams(json_object=True)) + structured_outputs=StructuredOutputsParams(json_object=True)) outputs = llm.generate(prompts=( "Generate a JSON object with curly braces for a person with " @@ -195,8 +195,9 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) - if guided_decoding_backend.startswith("xgrammar"): + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) + if backend.startswith("xgrammar"): with pytest.raises(ValueError, match="The provided JSON schema contains features " "not supported by xgrammar."): @@ -230,7 +231,7 @@ def test_structured_output( parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) - if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]: + if backend not in ["outlines", "lm-format-enforcer"]: # # Test 4: Generate SQL statement using EBNF grammar # @@ -238,7 +239,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_ebnf)) outputs = llm.generate( ("Generate a sql statement that selects col_1 from " "table_1 where it is equal to 1. 
Make the response as short as " @@ -271,7 +273,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_lark)) outputs = llm.generate( ("Generate a sql statement that selects col_1 from " "table_1 where it is equal to 1. Make the response as short as " @@ -309,7 +312,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar="not a grammar")) + structured_outputs=StructuredOutputsParams( + grammar="not a grammar")) with pytest.raises(ValueError, match="Failed to convert the grammar "): llm.generate( ("Generate a sql statement that selects col_1 from " @@ -325,7 +329,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) + structured_outputs=StructuredOutputsParams(regex=sample_regex)) prompt = (f"Give an example IPv4 address with this regex: {sample_regex}. 
" f"Make the response as short as possible.") @@ -352,7 +356,8 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(choice=sample_guided_choice)) + structured_outputs=StructuredOutputsParams( + choice=sample_structured_outputs_choices)) outputs = llm.generate( ("The best language for type-safe systems programming is " @@ -368,7 +373,7 @@ def test_structured_output( generated_text = output.outputs[0].text print(generated_text) assert generated_text is not None - assert generated_text in sample_guided_choice + assert generated_text in sample_structured_outputs_choices print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # @@ -378,7 +383,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate( ("Generate a JSON with the brand, model and car_type of the most " @@ -422,7 +427,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate( ("Generate a description of a frog using 50 characters. 
" @@ -444,7 +449,7 @@ def test_structured_output( output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=json_schema) - if guided_decoding_backend not in ["outlines", "lm-format-enforcer"]: + if backend not in ["outlines", "lm-format-enforcer"]: # # Test 11: Generate structured output using structural_tag format # @@ -470,7 +475,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( structural_tag=json.dumps(structural_tag_config))) prompt = """ @@ -547,7 +552,7 @@ Make the response as short as possible. @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 + "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto", "deepseek_r1", NGRAM_SPEC_CONFIG), @@ -556,7 +561,7 @@ Make the response as short as possible. 
) def test_structured_output_with_reasoning_matrices( monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, + backend: str, tokenizer_mode: TokenizerMode, reasoning_parser: str, model_name: str, @@ -576,10 +581,11 @@ def test_structured_output_with_reasoning_matrices( enforce_eager=bool(not current_platform.is_tpu()), max_model_len=1024, max_num_seqs=16, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=True, + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + in {"xgrammar", "guidance"}, + reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, - reasoning_parser=reasoning_parser, speculative_config=speculative_config, ) tokenizer = llm.get_tokenizer() @@ -603,7 +609,7 @@ def test_structured_output_with_reasoning_matrices( sampling_params = SamplingParams( temperature=0.1, max_tokens=8192, - guided_decoding=GuidedDecodingParams(json=reasoning_schema), + structured_outputs=StructuredOutputsParams(json=reasoning_schema), ) outputs = llm.generate( [reasoning_prompt], @@ -640,13 +646,14 @@ def test_structured_output_auto_mode( llm = LLM(model=model_name, max_model_len=1024, - guided_decoding_backend="auto", + structured_outputs_config=dict(backend="auto"), tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) prompts = ( "Give an example JSON object for a grade " @@ -681,9 +688,10 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, - guided_decoding_backend="guidance", - guided_decoding_disable_any_whitespace=True, - guided_decoding_disable_additional_properties=True) + structured_outputs_config=dict( + backend="guidance", + disable_any_whitespace=True, + 
disable_additional_properties=True)) schema = { 'type': 'object', @@ -709,14 +717,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): "<|im_end|>\n<|im_start|>assistant\n") def generate_with_backend(backend): - guided_params = GuidedDecodingParams( + structured_outputs_params = StructuredOutputsParams( json=schema, backend=backend, disable_any_whitespace=True, disable_additional_properties=True) - sampling_params = SamplingParams(temperature=0, - max_tokens=256, - guided_decoding=guided_params) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + structured_outputs=structured_outputs_params) outputs = llm.generate(prompt, sampling_params=sampling_params) assert outputs is not None @@ -736,12 +745,11 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): assert "a6" not in generated -@pytest.mark.parametrize("guided_decoding_backend", - ["guidance", "xgrammar", "outlines"]) -def test_structured_output_batched_with_non_guided_requests( +@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) +def test_structured_output_batched_with_non_structured_outputs_requests( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], - guided_decoding_backend: str, + backend: str, ): monkeypatch.setenv("VLLM_USE_V1", "1") @@ -753,24 +761,25 @@ def test_structured_output_batched_with_non_guided_requests( model="meta-llama/Meta-Llama-3.1-8B-Instruct", enforce_eager=enforce_eager, max_model_len=1024, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=(guided_decoding_backend - in {"xgrammar", "guidance"}), + structured_outputs_config=StructuredOutputsConfig( + backend=backend, + disable_any_whitespace=backend in {"xgrammar", "guidance"}, + ), ) - guided_prompt = ( + structured_outputs_prompt = ( "Give an example JSON for an employee profile that fits this " "schema. Make the response as short as possible. 
Schema: " f"{sample_json_schema}") - non_guided_prompt = "The diameter of the Earth in kilometers is " + non_structured_outputs_prompt = "The diameter of the Earth in kilometers is " - prompts = [guided_prompt, non_guided_prompt] + prompts = [structured_outputs_prompt, non_structured_outputs_prompt] sampling_params = [ - SamplingParams( - temperature=1.0, - max_tokens=400, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)), + SamplingParams(temperature=1.0, + max_tokens=400, + structured_outputs=StructuredOutputsParams( + json=sample_json_schema)), # No max tokens, temp=0 to assert on contents SamplingParams( seed=42, @@ -801,16 +810,16 @@ def test_structured_output_batched_with_non_guided_requests( print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}") if index == 0: - # First prompt is guided, expect valid JSON + # First prompt is structured outputs, expect valid JSON assert "\n" not in generated_text output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) else: - # Second prompt is not guided, expect valid output + # Second prompt is not structured outputs, expect valid output # Cannot assert on exact output, but we can expect it to be factual assert "12,742" in generated_text - # non-guided requests should not return a valid JSON here + # non-structured outputs requests should not return a valid JSON here with pytest.raises(ValueError): output_json = json.loads(generated_text) diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py index dffb32846c05e..9aa285aa9b18d 100644 --- a/tests/v1/entrypoints/openai/test_chat_completion.py +++ b/tests/v1/entrypoints/openai/test_chat_completion.py @@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, "role": "user", "content": prompt, }], - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema 
+ }}, ) @@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): "content": prompt, }], extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): "role": "user", "content": prompt, }], - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3114d7639f045..9090beb4bbd2a 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): model=model_name, prompt=prompt, extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 631618d427d42..9a1c5f0b0d453 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2277,34 +2277,34 @@ def get_served_model_name(model: str, return served_model_name -GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines", - "lm-format-enforcer"] 
+StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines", + "lm-format-enforcer"] @config @dataclass -class DecodingConfig: - """Dataclass which contains the decoding strategy of the engine.""" +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" - backend: GuidedDecodingBackend = "auto" - """Which engine will be used for guided decoding (JSON schema / regex etc) - by default. With "auto", we will make opinionated choices based on request - contents and what the backend libraries currently support, so the behavior - is subject to change in each release.""" + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" disable_fallback: bool = False """If `True`, vLLM will not fallback to a different backend on error.""" disable_any_whitespace: bool = False - """If `True`, the model will not generate any whitespace during guided - decoding. This is only supported for xgrammar and guidance backends.""" + """If `True`, the model will not generate any whitespace during structured + outputs. This is only supported for xgrammar and guidance backends.""" disable_additional_properties: bool = False """If `True`, the `guidance` backend will not use `additionalProperties` in the JSON schema. This is only supported for the `guidance` backend and is used to better align its behaviour with `outlines` and `xgrammar`.""" - reasoning_backend: str = "" + reasoning_parser: str = "" """Select the reasoning parser depending on the model that you're using. 
This is used to parse the reasoning content into OpenAI API format.""" @@ -2451,8 +2451,9 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: DecodingConfig = field(default_factory=DecodingConfig) - """Decoding configuration.""" + structured_outputs_config: StructuredOutputsConfig = field( + default_factory=StructuredOutputsConfig) + """Structured outputs configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" quant_config: Optional[QuantizationConfig] = None @@ -2543,8 +2544,8 @@ class VllmConfig: vllm_factors.append(self.speculative_config.compute_hash()) else: vllm_factors.append("None") - if self.decoding_config: - vllm_factors.append(self.decoding_config.compute_hash()) + if self.structured_outputs_config: + vllm_factors.append(self.structured_outputs_config.compute_hash()) else: vllm_factors.append("None") if self.observability_config: @@ -3063,7 +3064,7 @@ class VllmConfig: f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " - f"decoding_config={self.decoding_config!r}, " + f"structured_outputs_config={self.structured_outputs_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e2a1ec68e6f53..fb5beab77b270 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -22,17 +22,16 @@ from typing_extensions import TypeIs, deprecated import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, - ConfigType, ConvertOption, DecodingConfig, - DetailedTraceModules, Device, DeviceConfig, - DistributedExecutorBackend, EPLBConfig, - GuidedDecodingBackend, HfOverrides, 
KVEventsConfig, + ConfigType, ConvertOption, DetailedTraceModules, + Device, DeviceConfig, DistributedExecutorBackend, + EPLBConfig, HfOverrides, KVEventsConfig, KVTransferConfig, LoadConfig, LogprobsMode, LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig, ModelDType, ModelImpl, ObservabilityConfig, ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, RunnerOption, SchedulerConfig, SchedulerPolicy, - SpeculativeConfig, TaskOption, TokenizerMode, - VllmConfig, get_attr_docs) + SpeculativeConfig, StructuredOutputsConfig, + TaskOption, TokenizerMode, VllmConfig, get_attr_docs) from vllm.config.multimodal import MMCacheType, MultiModalConfig from vllm.config.parallel import ExpertPlacementStrategy from vllm.config.utils import get_field @@ -418,12 +417,15 @@ class EngineArgs: disable_hybrid_kv_cache_manager: bool = ( SchedulerConfig.disable_hybrid_kv_cache_manager) - guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend - guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback - guided_decoding_disable_any_whitespace: bool = \ - DecodingConfig.disable_any_whitespace - guided_decoding_disable_additional_properties: bool = \ - DecodingConfig.disable_additional_properties + structured_outputs_config: StructuredOutputsConfig = get_field( + VllmConfig, "structured_outputs_config") + reasoning_parser: str = StructuredOutputsConfig.reasoning_parser + # Deprecated guided decoding fields + guided_decoding_backend: Optional[str] = None + guided_decoding_disable_fallback: Optional[bool] = None + guided_decoding_disable_any_whitespace: Optional[bool] = None + guided_decoding_disable_additional_properties: Optional[bool] = None + logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -462,7 +464,6 @@ class EngineArgs: additional_config: dict[str, Any] = \ get_field(VllmConfig, "additional_config") - reasoning_parser: str = DecodingConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load 
pt_load_map_location: str = LoadConfig.pt_load_map_location @@ -618,28 +619,29 @@ class EngineArgs: load_group.add_argument('--pt-load-map-location', **load_kwargs["pt_load_map_location"]) - # Guided decoding arguments - guided_decoding_kwargs = get_kwargs(DecodingConfig) - guided_decoding_group = parser.add_argument_group( - title="DecodingConfig", - description=DecodingConfig.__doc__, + # Structured outputs arguments + structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) + structured_outputs_group = parser.add_argument_group( + title="StructuredOutputsConfig", + description=StructuredOutputsConfig.__doc__, ) - guided_decoding_group.add_argument("--guided-decoding-backend", - **guided_decoding_kwargs["backend"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-fallback", - **guided_decoding_kwargs["disable_fallback"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-any-whitespace", - **guided_decoding_kwargs["disable_any_whitespace"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-additional-properties", - **guided_decoding_kwargs["disable_additional_properties"]) - guided_decoding_group.add_argument( + structured_outputs_group.add_argument( "--reasoning-parser", # This choice is a special case because it's not static choices=list(ReasoningParserManager.reasoning_parsers), - **guided_decoding_kwargs["reasoning_backend"]) + **structured_outputs_kwargs["reasoning_parser"]) + # Deprecated guided decoding arguments + for arg, type in [ + ("--guided-decoding-backend", str), + ("--guided-decoding-disable-fallback", bool), + ("--guided-decoding-disable-any-whitespace", bool), + ("--guided-decoding-disable-additional-properties", bool), + ]: + structured_outputs_group.add_argument( + arg, + type=type, + help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."), + deprecated=True) # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) @@ -934,6 +936,8 @@ class EngineArgs: 
**vllm_kwargs["compilation_config"]) vllm_group.add_argument("--additional-config", **vllm_kwargs["additional_config"]) + vllm_group.add_argument('--structured-outputs-config', + **vllm_kwargs["structured_outputs_config"]) # Other arguments parser.add_argument('--disable-log-stats', @@ -1421,14 +1425,25 @@ class EngineArgs: load_config = self.create_load_config() - decoding_config = DecodingConfig( - backend=self.guided_decoding_backend, - disable_fallback=self.guided_decoding_disable_fallback, - disable_any_whitespace=self.guided_decoding_disable_any_whitespace, - disable_additional_properties=\ - self.guided_decoding_disable_additional_properties, - reasoning_backend=self.reasoning_parser - ) + # Pass reasoning_parser into StructuredOutputsConfig + if self.reasoning_parser: + self.structured_outputs_config.reasoning_parser = \ + self.reasoning_parser + + # Forward the deprecated CLI args to the StructuredOutputsConfig + so_config = self.structured_outputs_config + if self.guided_decoding_backend is not None: + so_config.guided_decoding_backend = \ + self.guided_decoding_backend + if self.guided_decoding_disable_fallback is not None: + so_config.guided_decoding_disable_fallback = \ + self.guided_decoding_disable_fallback + if self.guided_decoding_disable_any_whitespace is not None: + so_config.guided_decoding_disable_any_whitespace = \ + self.guided_decoding_disable_any_whitespace + if self.guided_decoding_disable_additional_properties is not None: + so_config.guided_decoding_disable_additional_properties = \ + self.guided_decoding_disable_additional_properties observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( @@ -1446,7 +1461,7 @@ class EngineArgs: lora_config=lora_config, speculative_config=speculative_config, load_config=load_config, - decoding_config=decoding_config, + structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, 
kv_transfer_config=self.kv_transfer_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1ae82c9f6f6f9..6793041abc502 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,9 +10,8 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig, +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.config.lora import LoRAConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -955,10 +954,6 @@ class AsyncLLMEngine(EngineClient): """Get the parallel configuration of the vLLM engine.""" return self.engine.get_parallel_config() - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - async def get_scheduler_config(self) -> SchedulerConfig: """Get the scheduling configuration of the vLLM engine.""" return self.engine.get_scheduler_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 34b5dcb587503..708f3bbeeff15 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -16,9 +16,8 @@ import torch from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig, +from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.config.lora import LoRAConfig from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase, Stats @@ -213,8 +212,7 @@ class LLMEngine: self.device_config = vllm_config.device_config 
self.speculative_config = vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config - self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa - ) + self.structured_outputs_config = vllm_config.structured_outputs_config self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -364,10 +362,9 @@ class LLMEngine: self.observability_config.otlp_traces_endpoint) # Initialize reasoning parser if reasoning backend is set. - if self.decoding_config.reasoning_backend and \ - self.tokenizer: + if self.structured_outputs_config.reasoning_parser and self.tokenizer: reasoner_class = ReasoningParserManager.get_reasoning_parser( - self.decoding_config.reasoning_backend) + self.structured_outputs_config.reasoning_parser) self.reasoner: ReasoningParser = reasoner_class( self.tokenizer.get_lora_tokenizer()) @@ -381,7 +378,8 @@ class LLMEngine: self.seq_counter, stop_checker=StopChecker( self.scheduler_config.max_model_len, - self.reasoner if self.decoding_config.reasoning_backend + self.reasoner + if self.structured_outputs_config.reasoning_parser and self.tokenizer else None, ), )) @@ -772,10 +770,6 @@ class LLMEngine: """Gets the parallel configuration.""" return self.parallel_config - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config - def get_scheduler_config(self) -> SchedulerConfig: """Gets the scheduler configuration.""" return self.scheduler_config diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 808d2d0ce3d28..c345f17e6614f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -6,7 +6,7 @@ from abc import ABC, abstractmethod from typing import Any, AsyncGenerator, Iterable, Mapping, Optional, Union from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig 
from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt @@ -248,11 +248,6 @@ class EngineClient(ABC): """Get the model configuration of the vLLM engine.""" ... - @abstractmethod - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - ... - @abstractmethod async def get_input_preprocessor(self) -> InputPreprocessor: """Get the input processor of the vLLM engine.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f2264292fa660..63e9478612bb1 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -15,8 +15,8 @@ import vllm.envs as envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) -from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, - is_init_field) +from vllm.config import (CompilationConfig, ModelDType, + StructuredOutputsConfig, TokenizerMode, is_init_field) from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides, PoolerConfig, RunnerOption) from vllm.engine.llm_engine import LLMEngine @@ -192,6 +192,8 @@ class LLM: hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, override_pooler_config: Optional[PoolerConfig] = None, + structured_outputs_config: Optional[Union[dict[ + str, Any], StructuredOutputsConfig]] = None, kv_cache_memory_bytes: Optional[int] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, @@ -236,14 +238,30 @@ class LLM: compilation_config_instance = CompilationConfig( level=compilation_config) elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) + **{ + k: v + for k, v in 
compilation_config.items() + if is_init_field(CompilationConfig, k) + }) else: compilation_config_instance = compilation_config else: compilation_config_instance = CompilationConfig() + if structured_outputs_config is not None: + if isinstance(structured_outputs_config, dict): + structured_outputs_instance = StructuredOutputsConfig( + **{ + k: v + for k, v in structured_outputs_config.items() + if is_init_field(StructuredOutputsConfig, k) + }) + else: + structured_outputs_instance = structured_outputs_config + else: + structured_outputs_instance = StructuredOutputsConfig() + engine_args = EngineArgs( model=model, runner=runner, @@ -271,6 +289,7 @@ class LLM: hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, + structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, logits_processors=logits_processors, **kwargs, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c07e95e9370a0..93ea846f26f6c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1678,7 +1678,7 @@ async def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, tool_server=tool_server, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1697,7 +1697,7 @@ async def init_app_state( exclude_tools_when_tool_choice_none=args. 
exclude_tools_when_tool_choice_none, tool_parser=args.tool_call_parser, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1800,10 +1800,10 @@ def validate_api_server_args(args): f"(chose from {{ {','.join(valid_tool_parses)} }})") valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() - if args.reasoning_parser \ - and args.reasoning_parser not in valid_reasoning_parses: + if ((reasoning_parser := args.structured_outputs_config.reasoning_parser) + and reasoning_parser not in valid_reasoning_parses): raise KeyError( - f"invalid reasoning parser: {args.reasoning_parser} " + f"invalid reasoning parser: {reasoning_parser} " f"(chose from {{ {','.join(valid_reasoning_parses)} }})") diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2505e493625d8..cff4a45fdc43e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -54,8 +54,8 @@ from vllm.entrypoints.score_utils import (ScoreContentPartParam, from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.pooling_params import PoolingParams -from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, - RequestOutputKind, SamplingParams) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams, StructuredOutputsParams) from vllm.utils import random_uuid, resolve_obj_by_qualname logger = init_logger(__name__) @@ -373,11 +373,12 @@ class ResponsesRequest(OpenAIBaseModel): stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output - guided_decoding = None + structured_outputs = None if self.text is not None and self.text.format is not None: response_format = self.text.format - if response_format.type == 
"json_schema": - guided_decoding = GuidedDecodingParams.from_optional( + if (response_format.type == "json_schema" + and response_format.schema_ is not None): + structured_outputs = StructuredOutputsParams( json=response_format.schema_) elif response_format.type == "json_object": raise NotImplementedError("json_object is not supported") @@ -392,7 +393,7 @@ class ResponsesRequest(OpenAIBaseModel): stop_token_ids=stop_token_ids, output_kind=(RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY), - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, ) def is_include_output_logprobs(self) -> bool: @@ -547,42 +548,9 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, - description=("If specified, the output will follow the JSON schema."), - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - structural_tag: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the structural tag schema."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. 
If set, must be either " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -701,31 +669,33 @@ class ChatCompletionRequest(OpenAIBaseModel): if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - guided_json_object = None - if self.response_format is not None: - if self.response_format.type == "json_object": - guided_json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema - assert json_schema is not None - self.guided_json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format - assert structural_tag is not None and isinstance( - structural_tag, StructuralTagResponseFormat) - s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structural_tag = json.dumps(s_tag_obj) + response_format = self.response_format + json_schema_from_tool = self._get_json_schema_from_tool() + if response_format is not None or json_schema_from_tool is not None: + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() - guided_decoding = GuidedDecodingParams.from_optional( - json=self._get_guided_json_from_tool() or self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, - structural_tag=self.structural_tag, - ) + # Set structured output params for response format + if response_format is not None: + if response_format.type == "json_object": + 
self.structured_outputs.json_object = True + elif response_format.type == "json_schema": + json_schema = response_format.json_schema + assert json_schema is not None + self.structured_outputs.json = json_schema.json_schema + elif response_format.type == "structural_tag": + structural_tag = response_format + assert structural_tag is not None and isinstance( + structural_tag, StructuralTagResponseFormat) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structured_outputs.structural_tag = json.dumps( + s_tag_obj) + + # Set structured output params for tool calling + if json_schema_from_tool is not None: + self.structured_outputs.json = json_schema_from_tool extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -757,15 +727,14 @@ class ChatCompletionRequest(OpenAIBaseModel): truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, - bad_words= self.bad_words, + bad_words=self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, ) - def _get_guided_json_from_tool( - self) -> Optional[Union[str, dict, BaseModel]]: + def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]: # user has chosen to not use any tool if self.tool_choice == "none" or self.tools is None: return None @@ -875,28 +844,31 @@ class ChatCompletionRequest(OpenAIBaseModel): @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): + def check_structured_outputs_count(cls, data): if isinstance(data, ValueError): raise data - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None - ]) - # you can only use one kind of guided decoding - 
if guide_count > 1: + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) + # you can only use one kind of constraints for structured outputs + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") - # you can only either use guided decoding or tools, not both - if guide_count > 1 and data.get("tool_choice", "none") not in ( + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") + # you can only either use structured outputs or tools, not both + if count > 1 and data.get("tool_choice", "none") not in ( "none", "auto", "required", ): raise ValueError( - "You can only either use guided decoding or tools, not both.") + "You can only either use constraints for structured outputs " + "or tools, not both.") return data @model_validator(mode="before") @@ -1049,37 +1021,9 @@ class CompletionRequest(OpenAIBaseModel): ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." 
), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, - description="If specified, the output will follow the JSON schema.", - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. If set, must be one of " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -1210,20 +1154,10 @@ class CompletionRequest(OpenAIBaseModel): echo_without_generation = self.echo and self.max_tokens == 0 - guided_json_object = None - if (self.response_format is not None + if (self.structured_outputs is not None + and self.response_format is not None and self.response_format.type == "json_object"): - guided_json_object = True - - guided_decoding = GuidedDecodingParams.from_optional( - json=self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, - ) + self.structured_outputs.json_object = True extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1255,7 
+1189,7 @@ class CompletionRequest(OpenAIBaseModel): truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, @@ -1263,16 +1197,18 @@ class CompletionRequest(OpenAIBaseModel): @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None - ]) - if guide_count > 1: + def check_structured_outputs_count(cls, data): + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") return data @model_validator(mode="before") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index cd85baa9ba661..16564214e353a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -993,7 +993,7 @@ class OpenAIServingChat(OpenAIServing): # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing - # only happens if we are NOT using guided decoding + # only happens if we are NOT using structured outputs auto_tools_called = False if tool_parser: auto_tools_called = len( diff --git a/vllm/model_executor/models/config.py 
b/vllm/model_executor/models/config.py index 687af7a189cea..ce3d23763ed64 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -262,9 +262,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - decoding_config = vllm_config.decoding_config - if decoding_config.reasoning_backend == "": - decoding_config.reasoning_backend = "openai_gptoss" + structured_outputs_config = vllm_config.structured_outputs_config + if structured_outputs_config.reasoning_parser == "": + structured_outputs_config.reasoning_parser = "openai_gptoss" # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index fe93e906064e4..0a01cb0260ae5 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampling parameters for text generation.""" import copy -from dataclasses import dataclass +from dataclasses import field from enum import Enum, IntEnum from functools import cached_property from typing import Annotated, Any, Optional, Union import msgspec -from pydantic import BaseModel +from pydantic.dataclasses import dataclass from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -28,60 +28,35 @@ class SamplingType(IntEnum): # maybe make msgspec? @dataclass -class GuidedDecodingParams: - """One of these fields will be used to build a logit processor.""" +class StructuredOutputsParams: + # One of these fields will be used to build a logit processor. 
json: Optional[Union[str, dict]] = None regex: Optional[str] = None choice: Optional[list[str]] = None grammar: Optional[str] = None json_object: Optional[bool] = None - """These are other options that can be set""" - backend: Optional[str] = None - backend_was_auto: bool = False + # These are other options that can be set. disable_fallback: bool = False disable_any_whitespace: bool = False disable_additional_properties: bool = False whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None - @staticmethod - def from_optional( - json: Optional[Union[dict, BaseModel, str]] = None, - regex: Optional[str] = None, - choice: Optional[list[str]] = None, - grammar: Optional[str] = None, - json_object: Optional[bool] = None, - backend: Optional[str] = None, - whitespace_pattern: Optional[str] = None, - structural_tag: Optional[str] = None, - ) -> Optional["GuidedDecodingParams"]: - if all(arg is None for arg in (json, regex, choice, grammar, - json_object, structural_tag)): - return None - # Extract json schemas from pydantic models - if isinstance(json, (BaseModel, type(BaseModel))): - json = json.model_json_schema() - return GuidedDecodingParams( - json=json, - regex=regex, - choice=choice, - grammar=grammar, - json_object=json_object, - backend=backend, - whitespace_pattern=whitespace_pattern, - structural_tag=structural_tag, - ) + _backend: Optional[str] = field(default=None, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" + _backend_was_auto: bool = field(default=False, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" def __post_init__(self): """Validate that some fields are mutually exclusive.""" - guide_count = sum([ + count = sum([ self.json is not None, self.regex is not None, self.choice is not None, self.grammar is not None, self.json_object is not None ]) - if guide_count > 1: + if count > 1: raise ValueError( - "You can only use one kind of guided decoding but 
multiple are " - f"specified: {self.__dict__}") + "You can only use one kind of structured outputs constraint " + f"but multiple are specified: {self.__dict__}") class RequestOutputKind(Enum): @@ -196,9 +171,8 @@ class SamplingParams( _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors - guided_decoding: Optional[GuidedDecodingParams] = None - """If provided, the engine will construct a guided decoding logits - processor from these parameters.""" + structured_outputs: Optional[StructuredOutputsParams] = None + """Parameters for configuring structured outputs.""" logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" @@ -246,7 +220,7 @@ class SamplingParams( msgspec.Meta( ge=-1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, - guided_decoding: Optional[GuidedDecodingParams] = None, + structured_outputs: Optional[StructuredOutputsParams] = None, logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, allowed_token_ids: Optional[list[int]] = None, extra_args: Optional[dict[str, Any]] = None, @@ -288,7 +262,7 @@ class SamplingParams( logits_processors=logits_processors, truncate_prompt_tokens=truncate_prompt_tokens, output_kind=output_kind, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=logit_bias, allowed_token_ids=allowed_token_ids, extra_args=extra_args, @@ -559,7 +533,7 @@ class SamplingParams( "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " - f"guided_decoding={self.guided_decoding}, " + f"structured_outputs={self.structured_outputs}, " f"extra_args={self.extra_args})") diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 5b07327cf2b81..d8a8d19391cd0 100644 --- 
a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -274,7 +274,7 @@ class MistralTokenizer(TokenizerBase): return tokenizer_file # the following attributes are set to fit vLLM's design and are used - # by the guided structured output backends. + # by the structured output backends. @property def all_special_tokens_extended(self) -> list[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens @@ -463,9 +463,6 @@ class MistralTokenizer(TokenizerBase): return decoded - # WARN: Outlines logits processors can overwrite this method. - # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer - # for more. def decode(self, ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index f17c269e4709e..73165c7e4c0ad 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -588,9 +588,6 @@ class AsyncLLM(EngineClient): async def get_model_config(self) -> ModelConfig: return self.model_config - async def get_decoding_config(self): - raise ValueError("Not Supported on V1 yet.") - async def get_input_preprocessor(self) -> InputPreprocessor: return self.processor.input_preprocessor diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 8d9f2ba1ec825..71f539583a1be 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -45,7 +45,7 @@ class Processor: self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config - self.decoding_config = vllm_config.decoding_config + self.structured_outputs_config = vllm_config.structured_outputs_config self.tokenizer = tokenizer self.generation_config_fields = ( @@ -219,58 +219,57 @@ class Processor: "[lora_path]` to use the LoRA tokenizer.") def _validate_structured_output(self, params: SamplingParams) -> None: - if not params.guided_decoding or not 
self.decoding_config: + if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.guided_decoding: + if self.model_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) - engine_level_backend = self.decoding_config.backend - if params.guided_decoding.backend: - # Request-level backend selection is not supported in V1. + backend = self.structured_outputs_config.backend + if _backend := params.structured_outputs._backend: + # Request-level backend selection is not supported. # The values may differ if `params` is reused and was set # to a specific backend based on `auto` behavior in a previous # request. We remember that it was set as a result of `auto` - # using the `_auto` option set on the backend in the params. - if (params.guided_decoding.backend != engine_level_backend - and not (engine_level_backend == "auto" - and params.guided_decoding.backend_was_auto)): + # using the `_backend_was_auto` field set in the params. + if (backend != _backend + and not (backend == "auto" + and params.structured_outputs._backend_was_auto)): raise ValueError( - "Request-level structured output backend selection is no " - "longer supported. The request specified " - f"'{params.guided_decoding.backend}', but vLLM was " - f"initialised with '{engine_level_backend}'. This error " - "can be resolved by removing backend selection from the " - "request.") + "Request-level structured output backend selection is not " + f"supported. The request specified '{_backend}', but vLLM " + f"was initialised with '{backend}'. 
This error can be " + "resolved by removing '_backend' from the request.") else: - params.guided_decoding.backend = engine_level_backend + params.structured_outputs._backend = backend # Request content validation - if (isinstance(params.guided_decoding.choice, list) - and not params.guided_decoding.choice): + if (isinstance(params.structured_outputs.choice, list) + and not params.structured_outputs.choice): # It is invalid for choice to be an empty list - raise ValueError(f"Choice '{params.guided_decoding.choice}' " - "cannot be an empty list") + raise ValueError( + f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 + ) - if engine_level_backend.startswith("xgrammar"): + if backend.startswith("xgrammar"): # xgrammar with no fallback validate_xgrammar_grammar(params) - elif engine_level_backend.startswith("guidance"): + elif backend.startswith("guidance"): # TODO: ideally we would have the LLTokenizer here as Lark syntax # allows <|special_token|> and similar, see # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. validate_guidance_grammar(params, tokenizer=None) - elif engine_level_backend == "outlines": + elif backend == "outlines": # outlines backend validate_structured_output_request_outlines(params) - elif engine_level_backend == "lm-format-enforcer": + elif backend == "lm-format-enforcer": # lm format enforcer backend validate_structured_output_request_lm_format_enforcer(params) else: - # NOTE: engine_level_backend must be "auto" here, because we have + # NOTE: backend must be "auto" here, because we have # checked supported_backends above. # In this mode, we set opinionated defaults based on what we think # will satisfy the most use cases without having to worry about @@ -278,15 +277,15 @@ class Processor: # other setting where a specific backend was specified. 
try: validate_xgrammar_grammar(params) - params.guided_decoding.backend = "xgrammar" + params.structured_outputs._backend = "xgrammar" except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. Fall back to guidance. validate_guidance_grammar(params, tokenizer=None) - params.guided_decoding.backend = "guidance" + params.structured_outputs._backend = "guidance" # Remember that this backend was set automatically - params.guided_decoding.backend_was_auto = True + params.structured_outputs._backend_was_auto = True def _maybe_build_mm_uuids( self, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 4e3e581235cce..145af788d2372 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -67,7 +67,7 @@ class Request: # Generative models. assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - if sampling_params.guided_decoding is not None: + if sampling_params.structured_outputs is not None: self.status = RequestStatus.WAITING_FOR_FSM self.use_structured_output = True diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 1ab29dfecd9e4..13c33d3edf141 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -61,11 +61,11 @@ class StructuredOutputManager: self.executor = ThreadPoolExecutor(max_workers=max_workers) self.tokenizer = init_tokenizer_from_configs( model_config=self.vllm_config.model_config) - reasoning_backend = \ - self.vllm_config.decoding_config.reasoning_backend - if reasoning_backend: + reasoning_parser = \ + self.vllm_config.structured_outputs_config.reasoning_parser + if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( - reasoning_backend) + reasoning_parser) self.reasoner = reasoner_cls(tokenizer=self.tokenizer) def grammar_init(self, request: Request) -> None: @@ -74,15 +74,16 @@ class StructuredOutputManager: if 
TYPE_CHECKING: assert request.sampling_params is not None and \ - request.sampling_params.guided_decoding is not None + request.sampling_params.structured_outputs is not None # Initialize the backend the first time it is needed. # # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). + # _backend is set in Processor._validate_structured_output if self.backend is None: assert request.sampling_params is not None - backend = request.sampling_params.guided_decoding.backend + backend = request.sampling_params.structured_outputs._backend vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": self.backend = XgrammarBackend( diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 02e7fc33f517d..e06ab6377de3a 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace self.disable_additional_properties = \ - self.vllm_config.decoding_config.disable_additional_properties + self.vllm_config.structured_outputs_config.disable_additional_properties self.ll_tokenizer = llguidance_hf.from_tokenizer( self.tokenizer, self.vocab_size) diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index 2279a1c8c8a00..465b2428f8938 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -138,30 +138,30 @@ class LMFormatEnforcerBackend(StructuredOutputBackend): def validate_structured_output_request_lm_format_enforcer( params: SamplingParams): - if params.guided_decoding is None: + if 
params.structured_outputs is None: return - gd_params = params.guided_decoding + so_params = params.structured_outputs - if gd_params.regex: + if so_params.regex: return - elif gd_params.json: - if isinstance(gd_params.json, str): + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) + json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - json.dumps(gd_params.json) + json.dumps(so_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e return - elif gd_params.choice: + elif so_params.choice: return - elif gd_params.grammar: - raise ValueError("LM Format Enforcer guided decoding backend " + elif so_params.grammar: + raise ValueError("LM Format Enforcer structured outputs backend " "does not support grammar specifications") diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 572e4984480fa..e5e638a6ad764 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -158,36 +158,36 @@ class OutlinesGrammar(StructuredOutputGrammar): def validate_structured_output_request_outlines(params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - gd_params = params.guided_decoding + so_params = params.structured_outputs - if gd_params.regex: - validate_regex_is_buildable(gd_params.regex) - elif gd_params.json: - if isinstance(gd_params.json, str): + if so_params.regex: + validate_regex_is_buildable(so_params.regex) + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) - schema = gd_params.json + json.loads(so_params.json) + schema = so_params.json except 
json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - schema = json.dumps(gd_params.json) + schema = json.dumps(so_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e pattern = json_schema.build_regex_from_schema(schema) validate_regex_is_buildable(pattern) - elif gd_params.choice: - choices = [regex_escape(str(choice)) for choice in gd_params.choice] + elif so_params.choice: + choices = [regex_escape(str(choice)) for choice in so_params.choice] regex = "(" + "|".join(choices) + ")" validate_regex_is_buildable(regex) - elif gd_params.grammar: - raise ValueError("Outlines guided decoding backend " + elif so_params.grammar: + raise ValueError("Outlines structured outputs backend " "does not support grammar specifications") @@ -306,7 +306,7 @@ def validate_regex_is_buildable(pattern: str) -> None: _check_unsupported(parsed) except ValueError as e: raise ValueError( - f"Regex uses unsupported feature for guided decoding: {e}. " + f"Regex uses unsupported feature for structured outputs: {e}. " "Only basic matching constructs are supported—lookarounds, " "backreferences, and unicode boundaries are not.") from e @@ -315,6 +315,6 @@ def validate_regex_is_buildable(pattern: str) -> None: "Regex does not have a anchored universal start state" "This means that the Regex uses anchors (^) or look-arounds " "in a way which requires context before any token is matched." - "Guided decoding needs regexes that can match without needing " + "structured outputs needs regexes that can match without needing " "that context. Try rewriting the pattern without using these " f"constructs. 
Pattern:\n{pattern}") diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 5e00f63804162..55b4792fe010d 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. @@ -248,37 +248,37 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: Raises ValueError if the request is not supported. """ - if sampling_params.guided_decoding is None: + if sampling_params.structured_outputs is None: return - gd_params = sampling_params.guided_decoding + so_params = sampling_params.structured_outputs - if gd_params.regex: + if so_params.regex: try: - xgr.Grammar.from_regex(gd_params.regex) + xgr.Grammar.from_regex(so_params.regex) except Exception as err: raise ValueError("Failed to transform regex into a grammar: " f"{err}") from err - if gd_params.choice: - choice_grammar = choice_as_grammar(gd_params.choice) + if so_params.choice: + choice_grammar = choice_as_grammar(so_params.choice) try: xgr.Grammar.from_ebnf(choice_grammar) except Exception as err: raise ValueError("Failed to transform choices into a grammar: " "{err}") from err - gd_params.choice = None - gd_params.grammar = choice_grammar + so_params.choice = None + so_params.grammar = choice_grammar return - if gd_params.json: - if isinstance(gd_params.json, str): + if so_params.json: + if isinstance(so_params.json, str): try: - schema = json.loads(gd_params.json) + schema = json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: - schema = gd_params.json + schema = 
so_params.json try: xgr.Grammar.from_json_schema(schema) @@ -291,11 +291,11 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: "supported by xgrammar.") return - if gd_params.grammar: - if grammar_is_likely_lark(gd_params.grammar): + if so_params.grammar: + if grammar_is_likely_lark(so_params.grammar): # xgrammar supports EBNF grammars only try: - gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar) + so_params.grammar = convert_lark_to_ebnf(so_params.grammar) except ValueError as e: raise ValueError( "Failed to convert the grammar from Lark to EBNF. ") from e @@ -303,14 +303,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: # Test parsing EBNF grammar, possibly already converted from Lark try: # parse the grammar, but we aren't compiling it. - xgr.Grammar.from_ebnf(gd_params.grammar) + xgr.Grammar.from_ebnf(so_params.grammar) except Exception as e: raise ValueError("Invalid grammar specification.") from e return - if gd_params.structural_tag: + if so_params.structural_tag: try: - s_tag = json.loads(gd_params.structural_tag) + s_tag = json.loads(so_params.structural_tag) tags = [ xgr.StructuralTagItem( begin=s["begin"], diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index fc365f12573fc..99974ef46ecd5 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -60,7 +60,7 @@ class StructuredOutputRequest: def get_structured_output_key( sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.guided_decoding + params = sampling_params.structured_outputs assert params is not None, "params can't be None." 
if params.json is not None: if not isinstance(params.json, str): From 4f02b77de4e794a0d417ed98a26884208f75e043 Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:43:23 +0800 Subject: [PATCH 22/58] Fix: Add explicit #include for OpenMP compatibility on certain toolchains (#24951) Signed-off-by: lyd1992 Signed-off-by: ihb2032 <1355790728@qq.com> --- csrc/cpu/cpu_types.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 17bbe04eef94a..c3a21796881c9 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -17,4 +17,8 @@ #warning "unsupported vLLM cpu implementation" #endif +#ifdef _OPENMP + #include +#endif + #endif \ No newline at end of file From abdfcd4f3dc21dc162baf6887f658fb0f2f3d783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= Date: Thu, 18 Sep 2025 12:25:12 +0200 Subject: [PATCH 23/58] silu-v1: Fix EPS not being used during max-reduction (#25069) Signed-off-by: elvircrn --- csrc/quantization/activation_kernels.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/quantization/activation_kernels.cu b/csrc/quantization/activation_kernels.cu index 9ddb5af3052fa..9aa1411b4a25c 100644 --- a/csrc/quantization/activation_kernels.cu +++ b/csrc/quantization/activation_kernels.cu @@ -365,7 +365,6 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( int32_t compute_pipeline_offset_64 = 0; for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) { - __nv_bfloat16 y_max_bf16 = EPS; __nv_bfloat162 results_bf162[2]; cp_async_wait(); @@ -405,7 +404,7 @@ __global__ void silu_mul_fp8_quant_deep_gemm_kernel( auto _y_max2 = __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1])); - y_max_bf16 = __hmax(_y_max2.x, _y_max2.y); + __nv_bfloat16 y_max_bf16 = __hmax(EPS, __hmax(_y_max2.x, _y_max2.y)); // An entire group is assigned to a single warp, so a simple warp reduce // is used. 
From cc935fdd7e0c466cd556b6515e435dddd78677e0 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Thu, 18 Sep 2025 18:34:42 +0800 Subject: [PATCH 24/58] [Frontend] Support setting logprobs to -1 (#25031) Signed-off-by: chaunceyjiang --- tests/entrypoints/openai/test_chat_echo.py | 23 ++++++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 8 +++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 0f459dd3d8574..ce965eb829248 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -99,3 +99,26 @@ async def test_prompt_logprobs(client: openai.AsyncOpenAI): assert completion.prompt_logprobs is not None assert len(completion.prompt_logprobs) > 0 + + +@pytest.mark.asyncio +async def test_top_logprobs(client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Beijing is the capital of which country?" 
+ }] + + completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body={ + "top_logprobs": -1, + "logprobs": "true", + }, + ) + assert completion.choices[0].logprobs is not None + assert completion.choices[0].logprobs.content is not None + assert len(completion.choices[0].logprobs.content) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index cff4a45fdc43e..7ad8e73d89d59 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -832,10 +832,12 @@ class ChatCompletionRequest(OpenAIBaseModel): raise ValueError("`prompt_logprobs=-1` is only supported with " "vLLM engine V1.") if (top_logprobs := data.get("top_logprobs")) is not None: - if top_logprobs < 0: - raise ValueError("`top_logprobs` must be a positive value.") + if top_logprobs < 0 and top_logprobs != -1: + raise ValueError( + "`top_logprobs` must be a positive value or -1.") - if top_logprobs > 0 and not data.get("logprobs"): + if (top_logprobs == -1 + or top_logprobs > 0) and not data.get("logprobs"): raise ValueError( "when using `top_logprobs`, `logprobs` must be set to true." 
) From 37970105fed95d58677f0a4635cb253a71e8817c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 18 Sep 2025 19:04:21 +0800 Subject: [PATCH 25/58] [Model] Improve Pooling Model (#25149) Signed-off-by: Jee Jee Li --- vllm/model_executor/layers/pooler.py | 12 ++++++------ vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index b571a8f866990..4a97438b1bb2c 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -12,8 +12,9 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.config import ModelConfig, PoolerConfig +from vllm.config import ModelConfig, PoolerConfig, get_current_vllm_config from vllm.logger import init_logger +from vllm.model_executor.models.adapters import _load_st_projector from vllm.pooling_params import PoolingParams from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput from vllm.tasks import PoolingTask @@ -377,7 +378,6 @@ class PoolerClassify(PoolerActivation): super().__init__() if static_num_labels: - from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() self.num_labels = getattr(vllm_config.model_config.hf_config, "num_labels", 0) @@ -427,8 +427,6 @@ class EmbeddingPoolerHead(PoolerHead): super().__init__(activation=PoolerNormalize()) # Load ST projector if available - from vllm.config import get_current_vllm_config - from vllm.model_executor.models.adapters import _load_st_projector vllm_config = get_current_vllm_config() self.projector: Optional[nn.Module] = _load_st_projector( @@ -489,7 +487,6 @@ class RewardPoolerHead(PoolerHead): def __init__(self) -> None: super().__init__(activation=PoolerClassify(static_num_labels=False)) - from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() self.head_dtype = vllm_config.model_config.head_dtype 
@@ -638,7 +635,6 @@ class ClassifierPooler(Pooler): ) -> None: super().__init__() - from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() self.pooling = pooling @@ -730,3 +726,7 @@ class DispatchPooler(Pooler): offset += num_items return PoolerOutput(outputs) + + def extra_repr(self) -> str: + s = f"supported_task={self.get_supported_tasks()}" + return s diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e8ad9c2fca07c..2e67984cb4327 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3151,6 +3151,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model = cast(VllmModelForPooling, self.get_model()) dummy_pooling_params = PoolingParams(task=task) + dummy_pooling_params.verify(task=task, model_config=self.model_config) to_update = model.pooler.get_pooling_updates(task) to_update.apply(dummy_pooling_params) From 8ed039d52775aaee4a61663dd5d8c840f5eebd15 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:24:27 +0100 Subject: [PATCH 26/58] Move `StructuredOutputsConfig` from `config/__init__.py` to `config/structured_outputs.py` (#25153) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 61 +---------------------------- vllm/config/structured_outputs.py | 64 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 60 deletions(-) create mode 100644 vllm/config/structured_outputs.py diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 9a1c5f0b0d453..69ab5712d404c 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -42,6 +42,7 @@ from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, ParallelConfig) from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.speculative import SpeculativeConfig +from vllm.config.structured_outputs import 
StructuredOutputsConfig from vllm.config.utils import ConfigType, config from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationMethods @@ -2277,66 +2278,6 @@ def get_served_model_name(model: str, return served_model_name -StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines", - "lm-format-enforcer"] - - -@config -@dataclass -class StructuredOutputsConfig: - """Dataclass which contains structured outputs config for the engine.""" - - backend: StructuredOutputsBackend = "auto" - """Which engine will be used for structured outputs (e.g. JSON schema, - regex, etc) by default. With "auto", we will make opinionated choices - based on request contents and what the backend libraries currently support, - so the behavior is subject to change in each release.""" - - disable_fallback: bool = False - """If `True`, vLLM will not fallback to a different backend on error.""" - - disable_any_whitespace: bool = False - """If `True`, the model will not generate any whitespace during structured - outputs. This is only supported for xgrammar and guidance backends.""" - - disable_additional_properties: bool = False - """If `True`, the `guidance` backend will not use `additionalProperties` - in the JSON schema. This is only supported for the `guidance` backend and - is used to better align its behaviour with `outlines` and `xgrammar`.""" - - reasoning_parser: str = "" - """Select the reasoning parser depending on the model that you're using. - This is used to parse the reasoning content into OpenAI API format.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. 
- - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self): - if (self.disable_any_whitespace - and self.backend not in ("xgrammar", "guidance")): - raise ValueError("disable_any_whitespace is only supported for " - "xgrammar and guidance backends.") - if (self.disable_additional_properties and self.backend != "guidance"): - raise ValueError("disable_additional_properties is only supported " - "for the guidance backend.") - - DetailedTraceModules = Literal["model", "worker", "all"] diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py new file mode 100644 index 0000000000000..b1f14294510f8 --- /dev/null +++ b/vllm/config/structured_outputs.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +from typing import Any, Literal + +from pydantic.dataclasses import dataclass + +from vllm.config.utils import config + +StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines", + "lm-format-enforcer"] + + +@config +@dataclass +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" + + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. 
With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" + disable_fallback: bool = False + """If `True`, vLLM will not fallback to a different backend on error.""" + disable_any_whitespace: bool = False + """If `True`, the model will not generate any whitespace during structured + outputs. This is only supported for xgrammar and guidance backends.""" + disable_additional_properties: bool = False + """If `True`, the `guidance` backend will not use `additionalProperties` + in the JSON schema. This is only supported for the `guidance` backend and + is used to better align its behaviour with `outlines` and `xgrammar`.""" + reasoning_parser: str = "" + """Select the reasoning parser depending on the model that you're using. + This is used to parse the reasoning content into OpenAI API format.""" + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
+ factors: list[Any] = [] + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() + return hash_str + + def __post_init__(self): + if (self.disable_any_whitespace + and self.backend not in ("xgrammar", "guidance")): + raise ValueError("disable_any_whitespace is only supported for " + "xgrammar and guidance backends.") + if (self.disable_additional_properties and self.backend != "guidance"): + raise ValueError("disable_additional_properties is only supported " + "for the guidance backend.") From eaffe4486cb1d7edf884e6e254cab33fc397e308 Mon Sep 17 00:00:00 2001 From: Kay Yan Date: Thu, 18 Sep 2025 19:36:47 +0800 Subject: [PATCH 27/58] [Docs] Fix pooling-params doc references in openai_compatible_server.md (#24939) --- docs/api/README.md | 1 - docs/serving/openai_compatible_server.md | 20 ++++++++++++-------- vllm/pooling_params.py | 20 ++++++++++++++------ 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 148211756480c..86e310f567dd3 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -46,7 +46,6 @@ Engine classes for offline and online inference. Inference parameters for vLLM APIs. 
[](){ #sampling-params } -[](){ #pooling-params } - [vllm.SamplingParams][] - [vllm.PoolingParams][] diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index bc52d02a50bd2..bac3f6c1fe90c 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -317,10 +317,11 @@ Full example: Date: Thu, 18 Sep 2025 20:37:08 +0900 Subject: [PATCH 28/58] [Docs] add the parallel sampling usage in LLMEngine and AsyncLLM (#24222) --- vllm/sampling_params.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 0a01cb0260ae5..efe70d019ccc6 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -81,7 +81,13 @@ class SamplingParams( """ n: int = 1 - """Number of output sequences to return for the given prompt.""" + """Number of outputs to return for the given prompt request. + + NOTE: + `AsyncLLM` streams outputs by default. When `n > 1`, all `n` outputs + are generated and streamed cumulatively per request. To see all `n` + outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY` + in `SamplingParams`.""" best_of: Optional[int] = None """Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. 
`best_of` From 5a33ae9a3faae79cad9d2659862fcd8d86483659 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:41:41 +0100 Subject: [PATCH 29/58] Fix forward reference warning in documentation (#25150) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/async_timeout.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 28a023a71ef52..3b9c055160c1b 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -16,19 +16,6 @@ if sys.version_info[:2] >= (3, 11): from asyncio import timeout as asyncio_timeout else: - def asyncio_timeout(delay: Optional[float]) -> "Timeout": - """timeout context manager. - Useful in cases when you want to apply timeout logic around block - of code or in cases when asyncio.wait_for is not suitable. For example: - >>> async with timeout(0.001): - ... async with aiohttp.get('https://github.com') as r: - ... await r.text() - delay - value in seconds or None to disable timeout logic - """ - loop = asyncio.get_running_loop() - deadline = loop.time() + delay if delay is not None else None - return Timeout(deadline, loop) - class _State(enum.Enum): INIT = "INIT" ENTER = "ENTER" @@ -171,3 +158,16 @@ else: self._state = _State.TIMEOUT # drop the reference early self._timeout_handler = None + + def asyncio_timeout(delay: Optional[float]) -> Timeout: + """timeout context manager. + Useful in cases when you want to apply timeout logic around block + of code or in cases when asyncio.wait_for is not suitable. For example: + >>> async with timeout(0.001): + ... async with aiohttp.get('https://github.com') as r: + ... 
await r.text() + delay - value in seconds or None to disable timeout logic + """ + loop = asyncio.get_running_loop() + deadline = loop.time() + delay if delay is not None else None + return Timeout(deadline, loop) From 3ed1ec4af25a9cb7dcfea74b839864fc3c8ba09d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 13:06:28 +0100 Subject: [PATCH 30/58] Fix `validate-config` pre-commit check (#25157) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 4 +--- tools/validate_config.py | 23 ++++++++++++++++------- vllm/config/__init__.py | 2 ++ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c16bdeeecd07a..13ad3af97d839 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -164,9 +164,7 @@ repos: name: Validate configuration has default values and that each field has a docstring entry: python tools/validate_config.py language: python - types: [python] - pass_filenames: true - files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py + additional_dependencies: [regex] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/tools/validate_config.py b/tools/validate_config.py index 8b1e955c653d7..f6439fa9ada5f 100644 --- a/tools/validate_config.py +++ b/tools/validate_config.py @@ -9,6 +9,8 @@ import ast import inspect import sys +import regex as re + def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]: """ @@ -88,11 +90,12 @@ def validate_class(class_node: ast.ClassDef): for stmt in class_node.body: # A field is defined as a class variable that has a type annotation. 
if isinstance(stmt, ast.AnnAssign): - # Skip ClassVar + # Skip ClassVar and InitVar # see https://docs.python.org/3/library/dataclasses.html#class-variables - if isinstance(stmt.annotation, ast.Subscript) and isinstance( - stmt.annotation.value, - ast.Name) and stmt.annotation.value.id == "ClassVar": + # and https://docs.python.org/3/library/dataclasses.html#init-only-variables + if (isinstance(stmt.annotation, ast.Subscript) + and isinstance(stmt.annotation.value, ast.Name) + and stmt.annotation.value.id in {"ClassVar", "InitVar"}): continue if isinstance(stmt.target, ast.Name): @@ -132,7 +135,7 @@ def validate_ast(tree: ast.stmt): def validate_file(file_path: str): try: - print(f"validating {file_path} config dataclasses ", end="") + print(f"Validating {file_path} config dataclasses ", end="") with open(file_path, encoding="utf-8") as f: source = f.read() @@ -140,7 +143,7 @@ def validate_file(file_path: str): validate_ast(tree) except ValueError as e: print(e) - SystemExit(2) + raise SystemExit(1) from e else: print("✅") @@ -151,7 +154,13 @@ def fail(message: str, node: ast.stmt): def main(): for filename in sys.argv[1:]: - validate_file(filename) + # Only run for Python files in vllm/ or tests/ + if not re.match(r"^(vllm|tests)/.*\.py$", filename): + continue + # Only run if the file contains @config + with open(filename, encoding="utf-8") as f: + if "@config" in f.read(): + validate_file(filename) if __name__ == "__main__": diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 69ab5712d404c..25daca00c02d9 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -450,6 +450,8 @@ class ModelConfig: # Multimodal config and init vars multimodal_config: Optional[MultiModalConfig] = None + """Configuration for multimodal model. 
If `None`, this will be inferred + from the architecture of `self.model`.""" limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None From 66072b36dbf1707440ff43d57273d9e9974349d7 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:21:17 +0300 Subject: [PATCH 31/58] [Bugfix][Mamba] - Fix Conv State Kernel FP32 Support (#24883) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- tests/models/language/generation/test_hybrid.py | 9 ++++++--- vllm/model_executor/layers/mamba/ops/causal_conv1d.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index d0e42062099ec..206ad1352e06e 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -418,7 +418,9 @@ def test_full_cuda_graph( @pytest.mark.parametrize("model", FP32_STATE_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_fp32_state( +@pytest.mark.parametrize("cache_dtype_param", + ["mamba_ssm_cache_dtype", "mamba_cache_dtype"]) +def test_fp32_cache_state( hf_runner, vllm_runner, example_prompts, @@ -426,6 +428,7 @@ def test_fp32_state( model: str, max_tokens: int, num_logprobs: int, + cache_dtype_param: str, ) -> None: try: @@ -443,13 +446,13 @@ def test_fp32_state( m.setenv("VLLM_USE_V1", "0") with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - mamba_ssm_cache_dtype="float32") as vllm_model: + **{cache_dtype_param: "float32"}) as vllm_model: vllm_v0_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - mamba_ssm_cache_dtype="float32") as vllm_model: + 
**{cache_dtype_param: "float32"}) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 2a88fa661da01..8cfd0962c5bfe 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -415,6 +415,9 @@ def causal_conv1d_fn( activation = "silu" args = None + # Store original dtype to cast back at the end + original_x_dtype = x.dtype + x = x.to(conv_states.dtype) out = torch.empty_like(x) if metadata is not None: cu_seqlen = metadata.cu_seqlen @@ -613,7 +616,7 @@ def causal_conv1d_fn( BLOCK_N=256, num_stages=2, ) - return out + return out.to(original_x_dtype) @triton.jit() @@ -973,6 +976,9 @@ def causal_conv1d_update( activation = "silu" if activation is True else None elif activation is not None: assert activation in ["silu", "swish"] + + original_x_dtype = x.dtype + x = x.to(conv_state.dtype) unsqueeze = query_start_loc is None and x.dim() == 2 if unsqueeze: # make it (batch, dim, seqlen) with seqlen == 1 @@ -1081,4 +1087,4 @@ def causal_conv1d_update( ) if unsqueeze: out = out.squeeze(-1) - return out + return out.to(original_x_dtype) From 21da73343ad35f756e053ba4155dafb05229b0c5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 18 Sep 2025 05:43:33 -0700 Subject: [PATCH 32/58] [Misc] Clean up flags in `vllm bench serve` (#25138) Signed-off-by: Roger Wang --- docs/contributing/benchmarks.md | 3 -- tests/benchmarks/test_serve_cli.py | 2 +- vllm/benchmarks/datasets.py | 8 ++--- vllm/benchmarks/serve.py | 49 +++++++++++++++++++++--------- 4 files changed, 39 insertions(+), 23 deletions(-) diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index d04b1d1136a1c..2a03ce1dffd63 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -156,7 +156,6 @@ vllm serve 
Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ @@ -245,7 +243,6 @@ vllm bench serve \ ```bash vllm bench serve \ --backend openai-chat \ - --endpoint-type openai-chat \ --model Qwen/Qwen2-VL-7B-Instruct \ --endpoint /v1/chat/completions \ --dataset-name hf \ diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index 5471d6b8e4a5f..fafbef5f37180 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -68,7 +68,7 @@ def test_bench_serve_chat(server): "5", "--endpoint", "/v1/chat/completions", - "--endpoint-type", + "--backend", "openai-chat", ] result = subprocess.run(command, capture_output=True, text=True) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1cab40802c392..68a937d5750ec 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1358,7 +1358,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. 
- if args.endpoint_type == "openai-chat": + if args.backend == "openai-chat": input_requests = dataset.sample( num_requests=args.num_prompts, input_len=args.sonnet_input_len, @@ -1462,7 +1462,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: "Please consider contributing if you would " "like to add support for additional dataset formats.") - if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [ + if dataset_class.IS_MULTIMODAL and args.backend not in [ "openai-chat", "openai-audio", ]: @@ -1470,7 +1470,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: # endpoint-type. raise ValueError( "Multi-modal content is only supported on 'openai-chat' and " - "'openai-audio' endpoint-type.") + "'openai-audio' backends.") input_requests = dataset_class( dataset_path=args.dataset_path, dataset_subset=args.hf_subset, @@ -1563,7 +1563,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: try: # Enforce endpoint compatibility for multimodal datasets. - if args.dataset_name == "random-mm" and args.endpoint_type not in [ + if args.dataset_name == "random-mm" and args.backend not in [ "openai-chat"]: raise ValueError( "Multi-modal content (images) is only supported on " diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index d8784340eba15..7382782f11655 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -8,8 +8,8 @@ to launch the vLLM OpenAI API server: On the client side, run: vllm bench serve \ - --endpoint-type \ - --label \ + --backend \ + --label \ --model \ --dataset-name \ --request-rate \ @@ -52,6 +52,21 @@ TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None) and (shutil.which("gnuplot") is not None)) +# TODO: Remove this in v0.11.0 +class DeprecatedEndpointTypeAction(argparse.Action): + """Argparse action for the deprecated --endpoint-type flag. 
+ """ + + def __call__(self, _, namespace, values, option_string=None): + warnings.warn( + "'--endpoint-type' is deprecated and will be removed in v0.11.0. " + "Please use '--backend' instead or remove this argument if you " + "have already set it.", + stacklevel=1, + ) + setattr(namespace, self.dest, values) + + class TaskType(Enum): GENERATION = "generation" EMBEDDING = "embedding" @@ -470,7 +485,7 @@ async def benchmark( else: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] else: - raise ValueError(f"Unknown endpoint_type: {endpoint_type}") + raise ValueError(f"Unknown backend: {endpoint_type}") # Reuses connections across requests to reduce TLS handshake overhead. connector = aiohttp.TCPConnector( @@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace, def add_cli_args(parser: argparse.ArgumentParser): add_dataset_parser(parser) - parser.add_argument( - "--endpoint-type", - type=str, - default="openai", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) parser.add_argument( "--label", type=str, default=None, help="The label (prefix) of the benchmark results. If not specified, " - "the endpoint type will be used as the label.", + "the value of '--backend' will be used as the label.", ) parser.add_argument( "--backend", type=str, - default="vllm", + default="openai", choices=list(ASYNC_REQUEST_FUNCS.keys()), + help="The type of backend or endpoint to use for the benchmark." + ) + parser.add_argument( + "--endpoint-type", + type=str, + default=None, + choices=list(ASYNC_REQUEST_FUNCS.keys()), + action=DeprecatedEndpointTypeAction, + help="'--endpoint-type' is deprecated and will be removed in v0.11.0. 
" + "Please use '--backend' instead.", ) parser.add_argument( "--base-url", @@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: raise ValueError( "For exponential ramp-up, the start RPS cannot be 0.") - endpoint_type = args.endpoint_type label = args.label model_id = args.model model_name = args.served_model_name @@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: gc.freeze() benchmark_result = await benchmark( - endpoint_type=args.endpoint_type, + endpoint_type=args.backend, api_url=api_url, base_url=base_url, model_id=model_id, @@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") result_json["date"] = current_dt - result_json["endpoint_type"] = args.endpoint_type + result_json["endpoint_type"] = args.backend # for backward compatibility + result_json["backend"] = args.backend result_json["label"] = label result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id @@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "") - label = label or endpoint_type + label = label or args.backend if args.ramp_up_strategy is not None: file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa else: From 470484a4f503d4768008c2f5a8dc828dc90633b4 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 18 Sep 2025 20:44:31 +0800 Subject: [PATCH 33/58] [Structured Output][Refactor] Move `apply_grammar_bitmask()` method from `ModelRunner` to structured output utils (#21999) Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm/v1/structured_output/utils.py | 80 ++++++++++++++++++++++++++++++ 
vllm/v1/worker/gpu_model_runner.py | 75 ++-------------------------- 2 files changed, 84 insertions(+), 71 deletions(-) diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 953185a8fc31d..127c8876525b5 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -8,7 +8,9 @@ import importlib.metadata import os from typing import TYPE_CHECKING +import numpy as np import regex as re +import torch from cachetools import LRUCache from diskcache import Cache @@ -20,9 +22,13 @@ if TYPE_CHECKING: import outlines_core as oc import transformers.file_utils as file_utils import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2 + import xgrammar as xgr from vllm.transformers_utils.tokenizer import AnyTokenizer + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.worker.gpu_input_batch import InputBatch else: + xgr = LazyLoader("xgr", globals(), "xgrammar") oc = LazyLoader("oc", globals(), "outlines_core") file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils") tokenization_gpt2 = LazyLoader( @@ -36,6 +42,80 @@ logger = init_logger(__name__) CACHE = None +def apply_grammar_bitmask( + scheduler_output: SchedulerOutput, + input_batch: InputBatch, + logits: torch.Tensor, + device: torch.device, +) -> None: + """ + Apply grammar bitmask to output logits of the model with xgrammar function. + + Args: + scheduler_output (SchedulerOutput): The result of engine scheduling. + input_batch (InputBatch): The input of model runner. + logits (torch.Tensor): The output logits of model forward. + device (torch.device): The device that model runner running on. + """ + grammar_bitmask = scheduler_output.grammar_bitmask + if grammar_bitmask is None: + return + + # We receive the structured output bitmask from the scheduler, + # compacted to contain bitmasks only for structured output requests. 
+ # The order of the requests in the bitmask is not guaranteed to be the + # same as the order of the requests in the gpu runner's batch. We need + # to sort the bitmask to match the order of the requests used here. + + # Get the batch indices of the structured output requests. + # Keep track of the number of speculative tokens scheduled for every + # request in the batch, as the logit indices are offset by this amount. + struct_out_req_batch_indices: dict[str, int] = {} + cumulative_offset = 0 + seq = sorted(input_batch.req_id_to_index.items(), key=lambda x: x[1]) + for req_id, batch_index in seq: + logit_index = batch_index + cumulative_offset + cumulative_offset += len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + if req_id in scheduler_output.structured_output_request_ids: + struct_out_req_batch_indices[req_id] = logit_index + + out_indices = [] + + # Reorder the bitmask to match the order of the requests in the batch. + sorted_bitmask = np.full(shape=(logits.shape[0], grammar_bitmask.shape[1]), + fill_value=-1, + dtype=grammar_bitmask.dtype) + cumulative_index = 0 + seq = sorted(scheduler_output.structured_output_request_ids.items(), + key=lambda x: x[1]) + for req_id, _ in seq: + logit_index = struct_out_req_batch_indices[req_id] + num_spec_tokens = len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + for i in range(1 + num_spec_tokens): + sorted_bitmask[logit_index + i] = \ + grammar_bitmask[cumulative_index + i] + out_indices.append(logit_index + i) + cumulative_index += 1 + num_spec_tokens + grammar_bitmask = sorted_bitmask + + # If the length of out indices and the logits have the same shape + # we don't need to pass indices to the kernel, + # since the bitmask is already aligned with the logits. + skip_out_indices = len(out_indices) == logits.shape[0] + + # Serialization of np.ndarray is much more efficient than a tensor, + # so we receive it in that format. 
+ grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() + + xgr.apply_token_bitmask_inplace( + logits, + grammar_bitmask.to(device, non_blocking=True), + indices=out_indices if not skip_out_indices else None, + ) + + class OutlinesVocabulary: """ Wrapper class for `outlines_core.Vocabulary`, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2e67984cb4327..4873b586724ec 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -54,7 +54,7 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LazyLoader, check_use_alibi, get_dtype_size, + GiB_bytes, check_use_alibi, get_dtype_size, is_pin_memory_available, round_up, supports_dynamo) from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder @@ -85,6 +85,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper @@ -101,12 +102,8 @@ from .utils import (AttentionGroup, MultiModalBudget, scatter_mm_placeholders) if TYPE_CHECKING: - import xgrammar as xgr - from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput -else: - xgr = LazyLoader("xgr", globals(), "xgrammar") logger = init_logger(__name__) @@ -1617,71 +1614,6 @@ class 
GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return tuple(tasks) - def apply_grammar_bitmask( - self, - scheduler_output: "SchedulerOutput", - logits: torch.Tensor, - ): - grammar_bitmask = scheduler_output.grammar_bitmask - if grammar_bitmask is None: - return - - # We receive the structured output bitmask from the scheduler, - # compacted to contain bitmasks only for structured output requests. - # The order of the requests in the bitmask is not guaranteed to be the - # same as the order of the requests in the gpu runner's batch. We need - # to sort the bitmask to match the order of the requests used here. - - # Get the batch indices of the structured output requests. - # Keep track of the number of speculative tokens scheduled for every - # request in the batch, as the logit indices are offset by this amount. - struct_out_req_batch_indices: dict[str, int] = {} - cumulative_offset = 0 - seq = sorted(self.input_batch.req_id_to_index.items(), - key=lambda x: x[1]) - for req_id, batch_index in seq: - logit_index = batch_index + cumulative_offset - cumulative_offset += len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - if req_id in scheduler_output.structured_output_request_ids: - struct_out_req_batch_indices[req_id] = logit_index - - out_indices = [] - - # Reorder the bitmask to match the order of the requests in the batch. 
- sorted_bitmask = np.full(shape=(logits.shape[0], - grammar_bitmask.shape[1]), - fill_value=-1, - dtype=grammar_bitmask.dtype) - cumulative_index = 0 - seq = sorted(scheduler_output.structured_output_request_ids.items(), - key=lambda x: x[1]) - for req_id, _ in seq: - logit_index = struct_out_req_batch_indices[req_id] - num_spec_tokens = len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) - cumulative_index += 1 + num_spec_tokens - grammar_bitmask = sorted_bitmask - - # If the length of out indices and the logits have the same shape - # we don't need to pass indices to the kernel, - # since the bitmask is already aligned with the logits. - skip_out_indices = len(out_indices) == logits.shape[0] - - # Serialization of np.ndarray is much more efficient than a tensor, - # so we receive it in that format. - grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() - - xgr.apply_token_bitmask_inplace( - logits, - grammar_bitmask.to(self.device, non_blocking=True), - indices=out_indices if not skip_out_indices else None, - ) - def sync_and_slice_intermediate_tensors( self, num_tokens: int, intermediate_tensors: IntermediateTensors, sync_self: bool) -> IntermediateTensors: @@ -2232,7 +2164,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: - self.apply_grammar_bitmask(scheduler_output, logits) + apply_grammar_bitmask(scheduler_output, self.input_batch, + logits, self.device) with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) From fbd6523ac00082c398dc8126434cede595169609 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 18 Sep 2025 08:53:45 -0400 Subject: [PATCH 34/58] Refactor dense FP8 tensor/channel/block utils 
and add CT FP8 block (#21404) --- vllm/model_executor/layers/linear.py | 14 +- .../compressed_tensors/compressed_tensors.py | 68 ++--- .../schemes/compressed_tensors_w8a8_fp8.py | 191 ++++++------- .../model_executor/layers/quantization/fp8.py | 265 ++++++------------ .../layers/quantization/utils/fp8_utils.py | 220 +++++++++++++++ 5 files changed, 441 insertions(+), 317 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cd05136520977..5bf96398bc710 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -805,12 +805,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear): assert loaded_shard_id < len(self.output_sizes) if isinstance(param, BlockQuantScaleParameter): - from vllm.model_executor.layers.quantization.fp8 import ( - Fp8LinearMethod, Fp8MoEMethod) assert self.quant_method is not None - assert isinstance(self.quant_method, - (Fp8LinearMethod, Fp8MoEMethod)) - weight_block_size = self.quant_method.quant_config.weight_block_size + # Assume the weight block size has been set by quant method + assert hasattr(self, "weight_block_size") + weight_block_size = self.weight_block_size assert weight_block_size is not None block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = ( @@ -989,8 +987,10 @@ class QKVParallelLinear(ColumnParallelLinear): # Note(simon): This is needed for Qwen3's fp8 quantization. 
if isinstance(param, BlockQuantScaleParameter): assert self.quant_method is not None - assert hasattr(self.quant_method, "quant_config") - weight_block_size = self.quant_method.quant_config.weight_block_size + # Assume the weight block size has been set by quant method + assert hasattr(self, "weight_block_size") + weight_block_size = self.weight_block_size + assert weight_block_size is not None block_n, _ = weight_block_size[0], weight_block_size[1] shard_offset = (shard_offset + block_n - 1) // block_n shard_size = (shard_size + block_n - 1) // block_n diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b56a691311774..d6550dd16892f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -12,7 +12,6 @@ from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy, QuantizationType) from compressed_tensors.transform import TransformConfig -from pydantic import BaseModel import vllm.envs as envs from vllm.logger import init_logger @@ -268,7 +267,8 @@ class CompressedTensorsConfig(QuantizationConfig): else: return False - def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel): + def _is_fp4a4_nvfp4(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs): if weight_quant is None or input_quant is None: return False @@ -288,8 +288,8 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_tensor_group_quant and is_float_type and is_4_bits and is_group_size_16 and is_symmetric) - def _is_fp4a16_nvfp4(self, weight_quant: BaseModel, - input_quant: BaseModel): + def _is_fp4a16_nvfp4(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs): is_weight_only = weight_quant is not None and input_quant is None is_tensor_group_quant = ( @@ -303,8 
+303,8 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_weight_only and is_tensor_group_quant and is_float_type and is_4_bits and is_group_size_16 and is_symmetric) - def _is_static_tensor_w8a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_static_tensor_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 weight_strategy = ( weight_quant.strategy == QuantizationStrategy.TENSOR.value @@ -317,8 +317,8 @@ class CompressedTensorsConfig(QuantizationConfig): # Only symmetric weight quantization supported. return is_8_bits and is_tensor and weight_quant.symmetric and is_static - def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_dynamic_token_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8 weight_strategy = ( weight_quant.strategy == QuantizationStrategy.TENSOR.value @@ -331,8 +331,8 @@ class CompressedTensorsConfig(QuantizationConfig): # Only symmetric weight quantization supported. return is_8_bits and is_token and weight_quant.symmetric and is_dynamic - def _is_dynamic_token_w4a8_int(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_dynamic_token_w4a8_int(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: is_weight_4_bits = weight_quant.num_bits == 4 is_activation_8_bits = input_quant.num_bits == 8 weight_strategy = ( @@ -347,8 +347,8 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_weight_4_bits and is_activation_8_bits and is_token and weight_quant.symmetric and is_dynamic) - def _is_fp8_w8a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: # Confirm weights and activations quantized. 
if weight_quant is None or input_quant is None: return False @@ -358,11 +358,12 @@ class CompressedTensorsConfig(QuantizationConfig): and input_quant.type == QuantizationType.FLOAT) is_symmetric_weight = weight_quant.symmetric is_static_weight = not weight_quant.dynamic - is_per_tensor_or_channel_weight = (weight_quant.strategy in [ - QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL, + QuantizationStrategy.BLOCK ]) if not (is_floating_point and is_symmetric_weight and is_static_weight - and is_per_tensor_or_channel_weight): + and is_tensor_or_channel_or_block_weight): return False # Dynamic quantization is always supported if weights supported. @@ -375,8 +376,8 @@ class CompressedTensorsConfig(QuantizationConfig): input_quant.strategy == QuantizationStrategy.TENSOR) return is_symmetric_activation and is_per_tensor_activation - def _is_fp8_w4a8(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w4a8(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: if not weight_quant or not input_quant: return False is_weight_4_bits = weight_quant.num_bits == 4 @@ -392,24 +393,24 @@ class CompressedTensorsConfig(QuantizationConfig): return (is_weight_4_bits and is_activation_8_bits and is_token and is_symmetric and is_dynamic) - def _is_fp8_w4a8_sm90(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w4a8_sm90(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: return (self._check_scheme_supported(90, error=False, match_exact=True) and self._is_fp8_w4a8(weight_quant, input_quant)) - def _is_fp8_w8a8_sm90(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a8_sm90(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: return (self._check_scheme_supported(90, error=False, 
match_exact=True) and self._is_fp8_w8a8(weight_quant, input_quant)) - def _is_fp8_w8a8_sm100(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a8_sm100(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: return (self._check_scheme_supported( 100, error=False, match_exact=True) and self._is_fp8_w8a8(weight_quant, input_quant)) - def _is_fp8_w8a16(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_fp8_w8a16(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: # Confirm weights quantized. if weight_quant is None: return False @@ -421,18 +422,19 @@ class CompressedTensorsConfig(QuantizationConfig): # Confirm weight scheme is supported. is_symmetric_weight = weight_quant.symmetric is_static_weight = not weight_quant.dynamic - is_per_tensor_or_channel_weight = (weight_quant.strategy in [ - QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + is_tensor_or_channel_or_block_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL, + QuantizationStrategy.BLOCK ]) if not (is_symmetric_weight and is_static_weight # noqa: SIM103 - and is_per_tensor_or_channel_weight): + and is_tensor_or_channel_or_block_weight): return False # All conditions satisfied. 
return True - def _is_wNa16_group_channel(self, weight_quant: BaseModel, - input_quant: BaseModel) -> bool: + def _is_wNa16_group_channel(self, weight_quant: QuantizationArgs, + input_quant: QuantizationArgs) -> bool: input_quant_none = input_quant is None is_channel_group = ( weight_quant.strategy == QuantizationStrategy.CHANNEL.value @@ -443,8 +445,8 @@ class CompressedTensorsConfig(QuantizationConfig): def _get_scheme_from_parts( self, - weight_quant: BaseModel, - input_quant: BaseModel, + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, format: Optional[str] = None) -> "CompressedTensorsScheme": # use the per-layer format if defined, otherwise, use global format @@ -496,7 +498,7 @@ class CompressedTensorsConfig(QuantizationConfig): CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( - strategy=weight_quant.strategy, + weight_quant=weight_quant, is_static_input_scheme=(input_quant and not input_quant.dynamic)) else: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index d984e89d9e02a..d42ae22c51393 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -4,28 +4,41 @@ from typing import Callable, Optional import torch -from compressed_tensors.quantization import QuantizationStrategy +from compressed_tensors.quantization import (QuantizationArgs, + QuantizationStrategy) from torch.nn import Parameter from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_fp8_block_linear, check_aiter_fp8_linear_support, + create_fp8_input_scale, 
create_fp8_scale_parameter, + create_fp8_weight_parameter, maybe_post_process_fp8_weight_block, + process_fp8_weight_block_strategy, process_fp8_weight_channel_strategy, + process_fp8_weight_tensor_strategy, validate_fp8_block_shape) from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - Fp8LinearOp, maybe_create_device_identity, normalize_e4m3fn_to_e4m3fnuz, - requantize_with_max_scale) -from vllm.model_executor.parameter import (ChannelQuantScaleParameter, - ModelWeightParameter, + Fp8LinearOp, cutlass_block_fp8_supported, maybe_create_device_identity) +from vllm.model_executor.parameter import (BlockQuantScaleParameter, + ChannelQuantScaleParameter, PerTensorScaleParameter) -from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] +strategy_to_parameter_type = { + QuantizationStrategy.BLOCK: BlockQuantScaleParameter, + QuantizationStrategy.CHANNEL: ChannelQuantScaleParameter, + QuantizationStrategy.TENSOR: PerTensorScaleParameter, +} + class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): - def __init__(self, strategy: str, is_static_input_scheme: bool): - self.strategy = strategy + def __init__(self, weight_quant: QuantizationArgs, + is_static_input_scheme: bool): + self.weight_quant = weight_quant + self.strategy = weight_quant.strategy self.out_dtype = torch.get_default_dtype() self.is_static_input_scheme = is_static_input_scheme self.act_q_group_shape = GroupShape.PER_TENSOR \ @@ -34,61 +47,84 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): act_quant_static=self.is_static_input_scheme, act_quant_group_shape=self.act_q_group_shape) + self.weight_block_size = self.weight_quant.block_structure + self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() + self.use_aiter_and_is_supported = check_aiter_fp8_linear_support() + @classmethod def get_min_capability(cls) -> int: # lovelace and up return 89 + def 
create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + weight_loader: Callable, **kwargs): + maybe_create_device_identity() + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.weight_block_size = None + + if self.strategy == QuantizationStrategy.BLOCK: + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + # Validate block quantization shapes + validate_fp8_block_shape(layer, input_size, output_size, + input_size_per_partition, + output_partition_sizes, + self.weight_block_size) + + # WEIGHT + weight = create_fp8_weight_parameter(output_size_per_partition, + input_size_per_partition, + weight_loader) + layer.register_parameter("weight", weight) + + # WEIGHT SCALE + weight_scale = create_fp8_scale_parameter( + strategy_to_parameter_type[self.strategy], output_partition_sizes, + input_size_per_partition, layer.weight_block_size, weight_loader) + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE + if self.is_static_input_scheme: + input_scale = create_fp8_input_scale(output_partition_sizes, + weight_loader) + layer.register_parameter("input_scale", input_scale) + def process_weights_after_loading(self, layer) -> None: - # If per tensor, when we have a fused module (e.g. 
QKV) with per - # tensor scales (thus N scales being passed to the kernel), - # requantize so we can always run per tensor if self.strategy == QuantizationStrategy.TENSOR: - max_w_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths, - ) + weight, weight_scale, input_scale = ( + process_fp8_weight_tensor_strategy( + layer.weight, layer.weight_scale, layer.logical_widths, + getattr(layer, 'input_scale', None))) + weight = weight.t() - if current_platform.is_fp8_fnuz(): - input_scale = getattr(layer, 'input_scale', None) - - weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=max_w_scale, - input_scale=input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) - - layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(max_w_scale, requires_grad=False) - - # If channelwise, scales are already lined up, so just transpose. 
elif self.strategy == QuantizationStrategy.CHANNEL: - weight = layer.weight + weight, weight_scale, input_scale = ( + process_fp8_weight_channel_strategy( + layer.weight, layer.weight_scale, + getattr(layer, 'input_scale', None))) + weight = weight.t() - if current_platform.is_fp8_fnuz(): - input_scale = getattr(layer, 'input_scale', None) - - weight, weight_scale, input_scale = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=layer.weight_scale, - input_scale=input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) - else: - weight_scale = layer.weight_scale.data - - layer.weight = Parameter(weight.t(), requires_grad=False) - # required by torch.compile to be torch.nn.Parameter - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + elif self.strategy == QuantizationStrategy.BLOCK: + assert self.is_static_input_scheme is False + weight, weight_scale = process_fp8_weight_block_strategy( + layer.weight, layer.weight_scale) + input_scale = None else: raise ValueError(f"Unknown quantization strategy {self.strategy}") + # required by torch.compile to be torch.nn.Parameter + layer.weight = Parameter(weight.data, requires_grad=False) + layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) + if input_scale is not None: + layer.input_scale = Parameter(input_scale.data, + requires_grad=False) + # INPUT SCALE if self.is_static_input_scheme and hasattr(layer, 'input_scale'): layer.input_scale = Parameter(layer.input_scale.max(), @@ -96,58 +132,23 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): else: layer.input_scale = None - def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: list[int], - input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): - maybe_create_device_identity() - - output_size_per_partition = sum(output_partition_sizes) - layer.logical_widths = output_partition_sizes - - # WEIGHT - 
weight = ModelWeightParameter(data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) - layer.register_parameter("weight", weight) - - # WEIGHT SCALE - # TODO: update create_xxx_parameter functions to return - # the newly added parameters - if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = ChannelQuantScaleParameter( - data=torch.empty((sum(output_partition_sizes), 1), - dtype=torch.float32), - output_dim=0, - weight_loader=weight_loader) - else: - assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = PerTensorScaleParameter(data=torch.empty( - len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) - - # min requirement for fp8 kernels - weight_scale[:] = torch.finfo(torch.float32).min - layer.register_parameter("weight_scale", weight_scale) - - # INPUT SCALE - if self.is_static_input_scheme: - input_scale = PerTensorScaleParameter(data=torch.empty( - len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) - input_scale[:] = torch.finfo(torch.float32).min - layer.register_parameter("input_scale", input_scale) + if self.strategy == QuantizationStrategy.BLOCK: + maybe_post_process_fp8_weight_block( + layer, self.cutlass_block_fp8_supported) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + if layer.weight_block_size is not None: + return apply_fp8_block_linear( + layer, + input=x, + bias=bias, + cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, + use_aiter_and_is_supported=self.use_aiter_and_is_supported) + return self.fp8_linear.apply(input=x, weight=layer.weight, weight_scale=layer.weight_scale, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e75094c54743c..aec9c79f1ea82 100644 --- 
a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING, Any, Callable, Optional, Union import torch -import torch.nn.functional as F from torch.nn import Module from torch.nn.parameter import Parameter @@ -32,8 +31,12 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( register_moe_scaling_factors, rotate_flashinfer_fp8_moe_weights, select_cutlass_fp8_gemm_impl, swap_w13_to_w31) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - get_col_major_tma_aligned_tensor, requant_weight_ue8m0_inplace, - should_use_deepgemm_for_fp8_linear) + apply_fp8_block_linear, check_aiter_fp8_linear_support, + create_fp8_input_scale, create_fp8_scale_parameter, + create_fp8_weight_parameter, get_col_major_tma_aligned_tensor, + maybe_post_process_fp8_weight_block, process_fp8_weight_block_strategy, + process_fp8_weight_tensor_strategy, requant_weight_ue8m0_inplace, + validate_fp8_block_shape) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin, prepare_moe_fp8_layer_for_marlin) @@ -42,8 +45,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, all_close_1d, cutlass_block_fp8_supported, cutlass_fp8_supported, maybe_create_device_identity, - normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize, - requantize_with_max_scale) + normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.parameter import (BlockQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) @@ -233,14 +235,10 @@ class Fp8LinearMethod(LinearMethodBase): if current_platform.is_rocm(): self.use_marlin = False - # AITER is only supported on ROCm and only for FP8_FNUZ - # and at the moment are MI300 series - self.use_aiter_and_is_supported = (current_platform.is_rocm() - 
and envs.VLLM_ROCM_USE_AITER - and envs.VLLM_ROCM_USE_AITER_LINEAR - and current_platform.is_fp8_fnuz()) + self.use_aiter_and_is_supported = check_aiter_fp8_linear_support() - self.block_quant = self.quant_config.weight_block_size is not None + self.weight_block_size = self.quant_config.weight_block_size + self.block_quant = self.weight_block_size is not None self.act_q_static = self.quant_config.activation_scheme == "static" # Use per-token quantization for better perf if dynamic and cutlass if not self.act_q_static and cutlass_fp8_supported(): @@ -273,51 +271,27 @@ class Fp8LinearMethod(LinearMethodBase): layer.weight_block_size = None if self.block_quant: - tp_size = getattr(layer, "tp_size", - get_tensor_model_parallel_world_size()) - assert self.quant_config.weight_block_size is not None - layer.weight_block_size = self.quant_config.weight_block_size - block_n, block_k = ( - self.quant_config.weight_block_size[0], - self.quant_config.weight_block_size[1], - ) - # Required by row parallel - if (tp_size > 1 - and input_size // input_size_per_partition == tp_size - and input_size_per_partition % block_k != 0): - raise ValueError( - f"Weight input_size_per_partition = " - f"{input_size_per_partition} is not divisible by " - f"weight quantization block_k = {block_k}.") - # Required by column parallel or enabling merged weights - is_tp_split = (tp_size > 1 and - output_size // output_size_per_partition == tp_size) - is_merged_gemm = len(output_partition_sizes) > 1 - if is_tp_split or is_merged_gemm: - sizes_to_check = output_partition_sizes - if not is_tp_split and is_merged_gemm: - # In case of merged matrices, we allow the last - # matrix to not be a multiple of block size - sizes_to_check = output_partition_sizes[:-1] - for output_partition_size in sizes_to_check: - if output_partition_size % block_n != 0: - raise ValueError( - f"Weight output_partition_size = " - f"{output_partition_size} is not divisible by " - f"weight quantization block_n = {block_n}.") + 
assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size + validate_fp8_block_shape(layer, input_size, output_size, + input_size_per_partition, + output_partition_sizes, + self.weight_block_size) # WEIGHT - weight_dtype = (torch.float8_e4m3fn - if self.quant_config.is_checkpoint_fp8_serialized else - params_dtype) - - weight = ModelWeightParameter(data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=weight_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) + if self.quant_config.is_checkpoint_fp8_serialized: + weight = create_fp8_weight_parameter(output_size_per_partition, + input_size_per_partition, + weight_loader) + else: + # For non-serialized checkpoints, use original dtype + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) # If checkpoint is serialized fp8, load them. 
@@ -325,154 +299,87 @@ class Fp8LinearMethod(LinearMethodBase): if self.quant_config.is_checkpoint_fp8_serialized: # WEIGHT SCALE if not self.block_quant: - scale = PerTensorScaleParameter( - data=torch.empty(len(output_partition_sizes), - dtype=torch.float32), - weight_loader=weight_loader, - ) - scale[:] = torch.finfo(torch.float32).min + scale = create_fp8_scale_parameter(PerTensorScaleParameter, + output_partition_sizes, + input_size_per_partition, + None, weight_loader) set_weight_attrs(scale, {"scale_type": "weight_scale"}) layer.register_parameter("weight_scale", scale) else: - assert self.quant_config.activation_scheme == "dynamic" - scale = BlockQuantScaleParameter( - data=torch.empty( - (output_size_per_partition + block_n - 1) // block_n, - (input_size_per_partition + block_k - 1) // block_k, - dtype=torch.float32, - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - scale[:] = torch.finfo(torch.float32).min + assert not self.act_q_static + assert self.weight_block_size is not None + scale = create_fp8_scale_parameter(BlockQuantScaleParameter, + output_partition_sizes, + input_size_per_partition, + self.weight_block_size, + weight_loader) set_weight_attrs(scale, {"scale_type": "weight_scale"}) # The weight_scale_inv name is intentional for deepseekv3 layer.register_parameter("weight_scale_inv", scale) # INPUT ACTIVATION SCALE - if self.quant_config.activation_scheme == "static": - scale = PerTensorScaleParameter(data=torch.empty( - len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) - - scale[:] = torch.finfo(torch.float32).min + if self.act_q_static: + scale = create_fp8_input_scale(output_partition_sizes, + weight_loader) set_weight_attrs(scale, {"scale_type": "input_scale"}) layer.register_parameter("input_scale", scale) else: layer.register_parameter("input_scale", None) - def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor: - # Pad the weight tensor. 
This is an optimization on ROCm platform, which - # can benefit from tensors located far enough from one another in memory - if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm() - and weight.stride(-1) == 1 - and (weight.stride(-2) * weight.element_size()) % 512 == 0): - num_pad = 256 // weight.element_size() - weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad] - torch.cuda.empty_cache() - return weight - def process_weights_after_loading(self, layer: Module) -> None: size_k_first = True + input_scale = None # TODO(rob): refactor block quant into separate class. if self.block_quant: - assert self.quant_config.activation_scheme == "dynamic" + assert not self.act_q_static size_k_first = False - if current_platform.is_fp8_fnuz(): - weight, weight_scale_inv, _ = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=layer.weight, - weight_scale=layer.weight_scale_inv) - else: - weight = layer.weight.data - weight_scale_inv = layer.weight_scale_inv.data - weight = self._maybe_pad_weight(weight) - - # Torch.compile cannot use Parameter subclasses. - layer.weight = Parameter(weight, requires_grad=False) - layer.weight_scale_inv = Parameter(weight_scale_inv, - requires_grad=False) + weight, weight_scale = process_fp8_weight_block_strategy( + layer.weight, layer.weight_scale_inv) + # Delete the weight_scale_inv parameter to avoid confusion + # with the weight_scale parameter + del layer.weight_scale_inv # If checkpoint not serialized fp8, quantize the weights. elif not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + weight = qweight.t() - # Update the layer with the new values. - layer.weight = Parameter(qweight.t(), requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) - # layer.input_scale is None indicates dynamic quant and scale is - # computed from input. 
- layer.input_scale = None - - # If checkpoint is fp8, handle that there are N scales for N + # If checkpoint is fp8 per-tensor, handle that there are N scales for N # shards in a fused module else: - layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, - requires_grad=False) - if self.quant_config.activation_scheme == "static": - layer.input_scale = torch.nn.Parameter(layer.input_scale.data, - requires_grad=False) - weight = layer.weight weight_scale = layer.weight_scale # If using w8a8, torch._scaled_mm needs per tensor, so # requantize the logical shards as a single weight. if not self.use_marlin: - # Dequant -> Quant with max scale so we can run per tensor. - if current_platform.is_fp8_fnuz(): - weight, weight_scale, input_scale = \ - normalize_e4m3fn_to_e4m3fnuz( - weight=weight, - weight_scale=weight_scale, - input_scale=layer.input_scale) - if input_scale is not None: - layer.input_scale = Parameter(input_scale, - requires_grad=False) + weight, weight_scale, input_scale = ( + process_fp8_weight_tensor_strategy( + weight, weight_scale, layer.logical_widths, + getattr(layer, 'input_scale', None))) + if self.act_q_static: + assert input_scale is not None + input_scale = input_scale.max() + weight = weight.t() - weight_scale, weight = requantize_with_max_scale( - weight=weight, - weight_scale=weight_scale, - logical_widths=layer.logical_widths, - ) - - weight = self._maybe_pad_weight(weight) - # Update layer with new values. - layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) - if self.quant_config.activation_scheme == "static": - layer.input_scale = Parameter(layer.input_scale.max(), - requires_grad=False) + # Update layer with new values. 
+ layer.weight = Parameter(weight.data, requires_grad=False) + layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) + layer.input_scale = Parameter( + input_scale, + requires_grad=False) if input_scale is not None else None if self.use_marlin: prepare_fp8_layer_for_marlin(layer, size_k_first) # Activations not quantized for marlin. del layer.input_scale + return - # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to - # requantize the weight and input to the specific scale - # at the same time. - if is_deep_gemm_e8m0_used() and self.block_quant: - assert layer.weight_block_size is not None - block_sz = tuple(layer.weight_block_size) - requant_weight_ue8m0_inplace( - layer.weight.data, - layer.weight_scale_inv.data if hasattr( - layer, "weight_scale_inv") else layer.weight_scale.data, - block_sz, - ) - - # SM90 Block FP8 CUTLASS requires row-major weight scales - if (self.block_quant and current_platform.is_device_capability(90) - and self.cutlass_block_fp8_supported - and not should_use_deepgemm_for_fp8_linear( - torch.bfloat16, layer.weight)): - layer.weight_scale_inv = Parameter( - layer.weight_scale_inv.data.T.contiguous(), - requires_grad=False) + if self.block_quant: + maybe_post_process_fp8_weight_block( + layer, self.cutlass_block_fp8_supported) def apply(self, layer: torch.nn.Module, @@ -490,18 +397,12 @@ class Fp8LinearMethod(LinearMethodBase): bias=bias) if self.block_quant: - assert self.quant_config.weight_block_size is not None - - return torch.ops.vllm.apply_w8a8_block_fp8_linear( + return apply_fp8_block_linear( + layer, input=x, - weight=layer.weight, - block_size=self.quant_config.weight_block_size, - weight_scale=layer.weight_scale_inv, - input_scale=layer.input_scale, bias=bias, cutlass_block_fp8_supported=self.cutlass_block_fp8_supported, - use_aiter_and_is_supported=self.use_aiter_and_is_supported, - ) + use_aiter_and_is_supported=self.use_aiter_and_is_supported) return self.fp8_linear.apply(input=x, 
weight=layer.weight, @@ -528,7 +429,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): super().__init__(layer.moe_config) self.layer = layer self.quant_config = quant_config - self.block_quant = self.quant_config.weight_block_size is not None + self.weight_block_size = self.quant_config.weight_block_size + self.block_quant = self.weight_block_size is not None self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None self.fused_experts: Optional[ @@ -590,12 +492,12 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.quant_config.is_checkpoint_fp8_serialized: params_dtype = torch.float8_e4m3fn if self.block_quant: - assert self.quant_config.weight_block_size is not None - layer.weight_block_size = self.quant_config.weight_block_size + assert self.weight_block_size is not None + layer.weight_block_size = self.weight_block_size tp_size = get_tensor_model_parallel_world_size() block_n, block_k = ( - self.quant_config.weight_block_size[0], - self.quant_config.weight_block_size[1], + self.weight_block_size[0], + self.weight_block_size[1], ) # NOTE: To ensure proper alignment of the block-wise quantization # scales, the output_size of the weights for both the gate and up @@ -952,7 +854,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): "BatchedTritonOrDeepGemmExperts(%s): " "max_tokens_per_rank=%s, block_size=%s, per_act_token=%s", self.__class__.__name__, max_num_tokens_per_rank, - self.quant_config.weight_block_size, False) + self.weight_block_size, False) return BatchedTritonOrDeepGemmExperts( max_num_tokens=max_num_tokens_per_rank, num_dispatchers=prepare_finalize.num_dispatchers(), @@ -969,8 +871,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): else: logger.debug( "TritonOrDeepGemmExperts(%s): block_size=%s, per_act_token=%s", - self.__class__.__name__, self.quant_config.weight_block_size, - False) + self.__class__.__name__, self.weight_block_size, False) return TritonOrDeepGemmExperts( quant_config=self.moe_quant_config, allow_deep_gemm=self.allow_deep_gemm, @@ -988,7 +889,7 
@@ class Fp8MoEMethod(FusedMoEMethodBase): if self.block_quant else layer.w2_weight_scale), a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, + block_shape=self.weight_block_size, ) def apply( @@ -1046,7 +947,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): intermediate_size=layer.intermediate_size_per_partition, expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, - block_shape=self.quant_config.weight_block_size, + block_shape=self.weight_block_size, routed_scaling=routed_scaling_factor, ) else: diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index bbe0c6f6d38ec..fc12483de0c0e 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -17,6 +17,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( group_broadcast) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( CUTLASS_BLOCK_FP8_SUPPORTED) +from vllm.model_executor.parameter import (BlockQuantScaleParameter, + ChannelQuantScaleParameter, + PerTensorScaleParameter) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op @@ -794,3 +797,220 @@ def requant_weight_ue8m0_inplace( # Write back the results in-place. w_q.copy_(w_requant) s_old.copy_(s_requant) + + +def check_aiter_fp8_linear_support() -> bool: + """AITER is only supported on ROCm and only for FP8_FNUZ + and at the moment are MI300 series""" + return (current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER + and envs.VLLM_ROCM_USE_AITER_LINEAR + and current_platform.is_fp8_fnuz()) + + +def _maybe_pad_fp8_weight(weight: torch.Tensor) -> torch.Tensor: + """Pad the weight tensor. 
This is an optimization on ROCm platform, which + can benefit from tensors located far enough from one another in memory""" + if (envs.VLLM_ROCM_FP8_PADDING and current_platform.is_rocm() + and weight.stride(-1) == 1 + and (weight.stride(-2) * weight.element_size()) % 512 == 0): + num_pad = 256 // weight.element_size() + import torch.nn.functional as F + weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad] + torch.cuda.empty_cache() + return weight + + +def validate_fp8_block_shape(layer: torch.nn.Module, input_size: int, + output_size: int, input_size_per_partition: int, + output_partition_sizes: list[int], + block_size: list[int]) -> None: + """Validate block quantization shapes for tensor parallelism.""" + from vllm.distributed import get_tensor_model_parallel_world_size + + tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size()) + block_n, block_k = block_size[0], block_size[1] + + # Required by row parallel + if (tp_size > 1 and input_size // input_size_per_partition == tp_size + and input_size_per_partition % block_k != 0): + raise ValueError( + f"Weight input_size_per_partition = {input_size_per_partition} " + f"is not divisible by weight quantization block_k = {block_k}.") + + # Required by column parallel or enabling merged weights + is_tp_split = (tp_size > 1 + and output_size // sum(output_partition_sizes) == tp_size) + is_merged_gemm = len(output_partition_sizes) > 1 + if is_tp_split or is_merged_gemm: + sizes_to_check = output_partition_sizes + if not is_tp_split and is_merged_gemm: + # In case of merged matrices, we allow the last + # matrix to not be a multiple of block size + sizes_to_check = output_partition_sizes[:-1] + for output_partition_size in sizes_to_check: + if output_partition_size % block_n != 0: + raise ValueError( + f"Weight output_partition_size = " + f"{output_partition_size} is not divisible by " + f"weight quantization block_n = {block_n}.") + + +def create_fp8_weight_parameter( + 
output_size_per_partition: int, input_size_per_partition: int, + weight_loader: Optional[Callable]) -> torch.nn.Parameter: + """Create FP8 weight parameter.""" + from vllm.model_executor.parameter import ModelWeightParameter + + return ModelWeightParameter(data=torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + +def create_fp8_scale_parameter( + parameter_type: torch.nn.Parameter, output_partition_sizes: list[int], + input_size_per_partition: int, block_size: Optional[list[int]], + weight_loader: Optional[Callable]) -> torch.nn.Parameter: + """Create scale parameter based on quantization strategy.""" + if parameter_type == ChannelQuantScaleParameter: + scale = parameter_type(data=torch.empty( + (sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + elif parameter_type == BlockQuantScaleParameter: + assert block_size is not None + block_n, block_k = block_size[0], block_size[1] + output_size_per_partition = sum(output_partition_sizes) + scale = parameter_type( + data=torch.empty( + (output_size_per_partition + block_n - 1) // block_n, + (input_size_per_partition + block_k - 1) // block_k, + dtype=torch.float32, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + elif parameter_type == PerTensorScaleParameter: + scale = parameter_type(data=torch.empty(len(output_partition_sizes), + dtype=torch.float32), + weight_loader=weight_loader) + else: + raise ValueError(f"Unknown parameter type: {parameter_type}") + + scale[:] = torch.finfo(torch.float32).min + return scale + + +def create_fp8_input_scale( + output_partition_sizes: list[int], + weight_loader: Optional[Callable]) -> torch.nn.Parameter: + """Create input scale parameter for static activation quantization.""" + from vllm.model_executor.parameter import PerTensorScaleParameter + + scale = PerTensorScaleParameter(data=torch.empty( + 
len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + scale[:] = torch.finfo(torch.float32).min + return scale + + +def process_fp8_weight_tensor_strategy( + weight: torch.Tensor, + weight_scale: torch.Tensor, + logical_widths: list[int], + input_scale: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Process weights for tensor-wise quantization strategy.""" + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + normalize_e4m3fn_to_e4m3fnuz, requantize_with_max_scale) + + if current_platform.is_fp8_fnuz(): + weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, weight_scale=weight_scale, input_scale=input_scale) + + # Requantize with max scale + weight_scale, weight = requantize_with_max_scale( + weight=weight, + weight_scale=weight_scale, + logical_widths=logical_widths, + ) + + weight = _maybe_pad_fp8_weight(weight) + return weight, weight_scale, input_scale + + +def process_fp8_weight_channel_strategy( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + """Process weights for channel-wise quantization strategy.""" + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + normalize_e4m3fn_to_e4m3fnuz) + + if current_platform.is_fp8_fnuz(): + weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, weight_scale=weight_scale, input_scale=input_scale) + + return weight, weight_scale, input_scale + + +def process_fp8_weight_block_strategy( + weight: torch.Tensor, + weight_scale: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """Process weights for block-wise quantization strategy.""" + from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + normalize_e4m3fn_to_e4m3fnuz) + + if current_platform.is_fp8_fnuz(): + weight, weight_scale, _ = 
normalize_e4m3fn_to_e4m3fnuz( + weight=weight, weight_scale=weight_scale) + + weight = _maybe_pad_fp8_weight(weight) + return weight, weight_scale + + +def maybe_post_process_fp8_weight_block(layer: torch.nn.Module, + cutlass_block_fp8_supported: bool): + assert layer.weight_block_size is not None + + from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used, + should_use_deepgemm_for_fp8_linear) + + # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to + # requantize the weight and input to the specific scale + # at the same time. + if is_deep_gemm_e8m0_used(): + block_sz = tuple(layer.weight_block_size) + requant_weight_ue8m0_inplace(layer.weight.data, + layer.weight_scale.data, block_sz) + # SM90 Block FP8 CUTLASS requires row-major weight scales + elif (current_platform.is_device_capability(90) + and cutlass_block_fp8_supported + and not should_use_deepgemm_for_fp8_linear(torch.bfloat16, + layer.weight)): + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.data.T.contiguous(), requires_grad=False) + + +def apply_fp8_block_linear(layer: torch.nn.Module, input: torch.Tensor, + bias: Optional[torch.Tensor], + cutlass_block_fp8_supported: bool, + use_aiter_and_is_supported: bool) -> torch.Tensor: + """Apply block-wise FP8 linear operation.""" + assert layer.weight_block_size is not None + + return torch.ops.vllm.apply_w8a8_block_fp8_linear( + input=input, + weight=layer.weight, + block_size=layer.weight_block_size, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale, + bias=bias, + cutlass_block_fp8_supported=cutlass_block_fp8_supported, + use_aiter_and_is_supported=use_aiter_and_is_supported, + ) From bc19d7598566ae81b3f69b43cbc2bd34aa5497c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 18 Sep 2025 15:56:07 +0200 Subject: [PATCH 35/58] [Misc] Add kv-connector label (#25156) Signed-off-by: NickLucche --- .github/mergify.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git 
a/.github/mergify.yml b/.github/mergify.yml index 94198b1251e09..75ee3e3c55b46 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -302,3 +302,20 @@ pull_request_rules: label: remove: - needs-rebase + +- name: label-kv-connector + description: Automatically apply kv-connector label + conditions: + - or: + - files~=^examples/online_serving/disaggregated[^/]*/.* + - files~=^examples/offline_inference/disaggregated[^/]*/.* + - files~=^examples/others/lmcache/ + - files~=^tests/v1/kv_connector/ + - files~=^vllm/distributed/kv_transfer/ + - title~=(?i)\bP/?D\b + - title~=(?i)NIXL + - title~=(?i)LMCache + actions: + label: + add: + - kv-connector \ No newline at end of file From 01a583fea40571986ffe277549e5bb441d409768 Mon Sep 17 00:00:00 2001 From: jvlunteren <161835099+jvlunteren@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:27:01 +0200 Subject: [PATCH 36/58] [Kernel] Decouple Tile Size from Block Size in Triton Unified Attention Kernel (#21197) Signed-off-by: Jan van Lunteren --- .../test_triton_unified_attention.py | 3 - .../attention/ops/triton_unified_attention.py | 122 ++++++++++-------- 2 files changed, 70 insertions(+), 55 deletions(-) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 4b97d51e6ed21..ab91560e995c8 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -102,9 +102,6 @@ def test_triton_unified_attn( ) -> None: torch.set_default_device("cuda") - if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32: - pytest.skip("block size must be at least 32 for fp8") - current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index d2ad2f7e8d2aa..591b68bfa6468 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ 
b/vllm/attention/ops/triton_unified_attention.py @@ -73,6 +73,7 @@ def kernel_unified_attention_2d( output_stride_1: tl.int64, # int, should be equal to head_size qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int must be power of 2 HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool @@ -118,6 +119,7 @@ def kernel_unified_attention_2d( offs_m = tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_SIZE_PADDED) + offs_t = tl.arange(0, TILE_SIZE) query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos @@ -177,31 +179,32 @@ def kernel_unified_attention_2d( # actual sequence length max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) - # calculate the number of tiles (blocks) that need to be processed to - # cover the longest sequence prefix (due to causal masking, blocks beyond + # calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond # this prefix can be skipped) - num_blocks = cdiv_fn(max_seq_prefix_len, BLOCK_SIZE) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) # iterate through tiles - for j in range(0, num_blocks): + for j in range(0, num_tiles): + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len - physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) - offs_n = tl.arange(0, BLOCK_SIZE) - - v_offset = (physical_block_idx * stride_v_cache_0 + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + kv_head_idx * stride_v_cache_2 + offs_d[None, :] * stride_v_cache_3 + - offs_n[:, None] * stride_v_cache_1) + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) - k_offset = (physical_block_idx * stride_k_cache_0 + + 
k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + kv_head_idx * stride_k_cache_2 + offs_d[:, None] * stride_k_cache_3 + - offs_n[None, :] * stride_k_cache_1) + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) - # K : (HEAD_SIZE, BLOCK_SIZE) + # K : (HEAD_SIZE, TILE_SIZE) K_load = tl.load(key_cache_ptr + k_offset, - mask=dim_mask[:, None], + mask=dim_mask[:, None] & tile_mask[None, :], other=0.0) if K_load.dtype.is_fp8(): @@ -212,9 +215,9 @@ def kernel_unified_attention_2d( else: K = K_load - # V : (BLOCK_SIZE, HEAD_SIZE) + # V : (TILE_SIZE, HEAD_SIZE) V_load = tl.load(value_cache_ptr + v_offset, - mask=dim_mask[None, :], + mask=dim_mask[None, :] & tile_mask[:, None], other=0.0) if V_load.dtype.is_fp8(): @@ -225,12 +228,10 @@ def kernel_unified_attention_2d( else: V = V_load - seq_offset = j * BLOCK_SIZE + offs_n - seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 - # S : (BLOCK_M, BLOCK_SIZE) - S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) @@ -262,11 +263,12 @@ def kernel_unified_attention_2d( # compute running maximum # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of # the entire row. 
In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_M, BLOCK_SIZE) + # P : (BLOCK_M, TILE_SIZE) P = tl.exp(S - m_j[:, None]) # l_j : (BLOCK_M,) @@ -327,6 +329,7 @@ def kernel_unified_attention_3d( query_stride_1: tl.int64, # int, should be equal to head_size qq_bias_stride_0: tl.int64, # int BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int, must be power of 2 HEAD_SIZE: tl.constexpr, # int HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 USE_ALIBI_SLOPES: tl.constexpr, # bool @@ -374,20 +377,19 @@ def kernel_unified_attention_3d( # number of segments for this particular sequence num_segments = NUM_SEGMENTS_PER_SEQ - blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) - if segm_idx * blocks_per_segment * BLOCK_SIZE >= seq_len: + if segm_idx * tiles_per_segment * TILE_SIZE >= seq_len: return offs_m = tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_SIZE_PADDED) - + offs_t = tl.arange(0, TILE_SIZE) query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos query_offset_1 = kv_head_idx * num_queries_per_kv + \ offs_m % num_queries_per_kv - query_offset = (query_offset_0[:, None] * query_stride_0 + query_offset_1[:, None] * query_stride_1 + offs_d[None, :]) @@ -433,30 +435,44 @@ def kernel_unified_attention_3d( qq_bias_row_ptrs = (qq_bias_ptr + query_pos[:, None] * qq_bias_stride_0 ) # shape: [BLOCK_M] - num_blocks = cdiv_fn(seq_len, BLOCK_SIZE) + # compute the length of the longest sequence prefix spanned by any + # query token in the current q_block (q_block_local_idx) + max_seq_prefix_len = context_len + q_block_local_idx * BLOCK_Q + ( + BLOCK_M - 1) // num_queries_per_kv + 1 + + # adjust for potential padding in the last q_block by considering the + # actual sequence length + max_seq_prefix_len = tl.minimum(max_seq_prefix_len, seq_len) + + # 
calculate the number of tiles that need to be processed to + # cover the longest sequence prefix (due to causal masking, tiles beyond + # this prefix can be skipped) + num_tiles = cdiv_fn(max_seq_prefix_len, TILE_SIZE) # iterate through tiles within current segment for j in range( - segm_idx * blocks_per_segment, - min((segm_idx + 1) * blocks_per_segment, num_blocks), + segm_idx * tiles_per_segment, + min((segm_idx + 1) * tiles_per_segment, num_tiles), ): - physical_block_idx = tl.load(block_tables_ptr + block_table_offset + j) + seq_offset = j * TILE_SIZE + offs_t + tile_mask = seq_offset < max_seq_prefix_len - offs_n = tl.arange(0, BLOCK_SIZE) + physical_block_idx = tl.load(block_tables_ptr + block_table_offset + + seq_offset // BLOCK_SIZE).to(tl.int64) - v_offset = (physical_block_idx * stride_v_cache_0 + + v_offset = (physical_block_idx[:, None] * stride_v_cache_0 + kv_head_idx * stride_v_cache_2 + offs_d[None, :] * stride_v_cache_3 + - offs_n[:, None] * stride_v_cache_1) + (seq_offset % BLOCK_SIZE)[:, None] * stride_v_cache_1) - k_offset = (physical_block_idx * stride_k_cache_0 + + k_offset = (physical_block_idx[None, :] * stride_k_cache_0 + kv_head_idx * stride_k_cache_2 + offs_d[:, None] * stride_k_cache_3 + - offs_n[None, :] * stride_k_cache_1) + (seq_offset % BLOCK_SIZE)[None, :] * stride_k_cache_1) - # K : (HEAD_SIZE, BLOCK_SIZE) + # K : (HEAD_SIZE, TILE_SIZE) K_load = tl.load(key_cache_ptr + k_offset, - mask=dim_mask[:, None], + mask=dim_mask[:, None] & tile_mask[None, :], other=0.0) if K_load.dtype.is_fp8(): @@ -467,9 +483,9 @@ def kernel_unified_attention_3d( else: K = K_load - # V : (BLOCK_SIZE, HEAD_SIZE) + # V : (TILE_SIZE, HEAD_SIZE) V_load = tl.load(value_cache_ptr + v_offset, - mask=dim_mask[None, :], + mask=dim_mask[None, :] & tile_mask[:, None], other=0.0) if V_load.dtype.is_fp8(): @@ -480,13 +496,10 @@ def kernel_unified_attention_3d( else: V = V_load - seq_offset = j * BLOCK_SIZE + offs_n - seq_mask = seq_offset[None, :] < context_len + 
query_pos[:, None] + 1 - # S : (BLOCK_M, BLOCK_SIZE) - S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) - + # S : (BLOCK_M, TILE_SIZE) + S = tl.zeros(shape=(BLOCK_M, TILE_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) if USE_SOFTCAP: @@ -517,11 +530,12 @@ def kernel_unified_attention_3d( # compute running maximum # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) + # For sliding window there's a chance the max is -inf due to masking of # the entire row. In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_M, BLOCK_SIZE,) + # P : (BLOCK_M, TILE_SIZE,) P = tl.exp(S - m_j[:, None]) # l_j : (BLOCK_M,) @@ -573,7 +587,7 @@ def reduce_segments( output_stride_0: tl.int64, # int output_stride_1: tl.int64, # int, should be equal to head_size block_table_stride: tl.int64, # int - BLOCK_SIZE: tl.constexpr, # int + TILE_SIZE: tl.constexpr, # int HEAD_SIZE: tl.constexpr, # int, must be power of 2 HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 query_start_len_ptr, # [num_seqs+1] @@ -594,10 +608,10 @@ def reduce_segments( # number of segments for this particular sequence num_segments = NUM_SEGMENTS_PER_SEQ - blocks_per_segment = cdiv_fn(seq_len, num_segments * BLOCK_SIZE) + tiles_per_segment = cdiv_fn(seq_len, num_segments * TILE_SIZE) # create masks for subsequent loads - act_num_segments = cdiv_fn(seq_len, blocks_per_segment * BLOCK_SIZE) + act_num_segments = cdiv_fn(seq_len, tiles_per_segment * TILE_SIZE) segm_mask = tl.arange(0, NUM_SEGMENTS_PER_SEQ) < tl.full( [NUM_SEGMENTS_PER_SEQ], act_num_segments, dtype=tl.int32) dim_mask = tl.where(tl.arange(0, HEAD_SIZE_PADDED) < HEAD_SIZE, 1, @@ -671,13 +685,10 @@ def unified_attention( # Optional tensor for sinks sinks=None, ): + assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" - block_size = v.shape[1] - assert q.element_size() >= 2 or block_size >= 32, \ - "Block size must be at least 32 for 
fp8" - if sinks is not None: assert sinks.shape[0] == q.shape[1], \ "Sinks must be num_query_heads size" @@ -707,6 +718,12 @@ def unified_attention( # = floor(q.shape[0] / BLOCK_Q) + num_seqs total_num_q_blocks = q.shape[0] // BLOCK_Q + num_seqs + # Assigning default tile sizes for prefill and decode. + # Note: each tile size must be at least 32 for "fp8" (q.element_size() == 1) + # and at least 16 for all other data types. + TILE_SIZE_PREFILL = 32 + TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32 + # if batch contains a prefill if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128: kernel_unified_attention_2d[( @@ -736,6 +753,7 @@ def unified_attention( output_stride_1=out.stride(1), qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_PREFILL, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -809,6 +827,7 @@ def unified_attention( query_stride_1=q.stride(1), qq_bias_stride_0=qq_bias.stride(0) if use_qq_bias else 0, BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), USE_ALIBI_SLOPES=use_alibi_slopes, @@ -830,7 +849,6 @@ def unified_attention( BLOCK_M=BLOCK_M, NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, ) - reduce_segments[(q.shape[0], num_query_heads)]( output_ptr=out, segm_output_ptr=segm_output, @@ -844,7 +862,7 @@ def unified_attention( output_stride_0=out.stride(0), output_stride_1=out.stride(1), block_table_stride=block_table.stride(0), - BLOCK_SIZE=block_size, + TILE_SIZE=TILE_SIZE_DECODE, HEAD_SIZE=head_size, HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), query_start_len_ptr=cu_seqlens_q, From 072d7e53e534d337b41262dd44ded9b44aa699ef Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:27:49 +0400 Subject: [PATCH 37/58] [PERF] Add `conv1d` metadata to GDN attn (#25105) Signed-off-by: Vadim 
Gimpelson --- vllm/model_executor/layers/mamba/mamba2_metadata.py | 8 +++++--- vllm/model_executor/models/qwen3_next.py | 10 +++++++++- vllm/v1/attention/backends/gdn_attn.py | 6 ++++++ vllm/v1/attention/backends/mamba2_attn.py | 4 ++-- vllm/v1/attention/backends/short_conv_attn.py | 4 ++-- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py index 368bfe3af1d3f..c926e17a2c197 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -11,6 +11,7 @@ from vllm.attention.backends.placeholder_attn import ( PlaceholderAttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.platforms import current_platform +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata from vllm.v1.attention.backends.mamba2_attn import ( Mamba2AttentionMetadata, _query_start_loc_to_chunk_indices_offsets) @@ -45,8 +46,8 @@ class Mamba2Metadata: """ nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None - batch_ptr: Optional[torch.tensor] = None - token_chunk_offset_ptr: Optional[torch.tensor] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]: @@ -117,7 +118,8 @@ def prepare_mamba2_metadata( def update_metadata(x: torch.Tensor, query_start_loc: torch.Tensor, mamba2_metadata: Union[Mamba2Metadata, - Mamba2AttentionMetadata]): + Mamba2AttentionMetadata, + GDNAttentionMetadata]): """ this is triggered upon handling a new input at the first layer """ diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index eb060cb90f44c..0c974ee44eee2 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -35,6 +35,7 @@ from vllm.model_executor.layers.linear import 
(ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.abstract import MambaBase +from vllm.model_executor.layers.mamba.mamba2_metadata import update_metadata from vllm.model_executor.layers.mamba.mamba_mixer2 import ( mamba_v2_sharded_weight_loader) from vllm.model_executor.layers.mamba.mamba_utils import ( @@ -414,6 +415,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): assert isinstance(attn_metadata, dict) attn_metadata = attn_metadata[self.prefix] + conv_metadata = attn_metadata assert isinstance(attn_metadata, GDNAttentionMetadata) has_initial_state = attn_metadata.has_initial_state spec_query_start_loc = attn_metadata.spec_query_start_loc @@ -475,10 +477,15 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): # 2.2: process the remaining part if attn_metadata.num_prefills > 0: + mixed_qkv_non_spec_T = mixed_qkv_non_spec.transpose(0, 1) + if conv_metadata.cu_seqlen is None: + conv_metadata = update_metadata(mixed_qkv_non_spec_T, + non_spec_query_start_loc, + conv_metadata) # - "cache_indices" updates the conv_state cache in positions # pointed to by "mamba_cache_params.state_indices_tensor" mixed_qkv_non_spec = causal_conv1d_fn( - mixed_qkv_non_spec.transpose(0, 1), + mixed_qkv_non_spec_T, conv_weights, self.conv1d.bias, activation=self.activation, @@ -486,6 +493,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): has_initial_state=has_initial_state, cache_indices=non_spec_state_indices_tensor, query_start_loc=non_spec_query_start_loc, + metadata=conv_metadata, ).transpose(0, 1) elif attn_metadata.num_decodes > 0: mixed_qkv_non_spec = causal_conv1d_update( diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index ba89f93e8b56d..5dadc52d0fb1c 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -50,6 +50,12 @@ class GDNAttentionMetadata: Tensor] = None # shape: 
[num_prefill_tokens + num_decode_tokens,] num_accepted_tokens: Optional[torch.Tensor] = None # shape: [batch,] + # The following attributes are for triton implementation of causal_conv1d + nums_dict: Optional[dict] = None + cu_seqlen: Optional[int] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None + class GDNAttentionMetadataBuilder( AttentionMetadataBuilder[GDNAttentionMetadata]): diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 359bad1ea9dee..2fe1f14ca1db0 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -132,8 +132,8 @@ class Mamba2AttentionMetadata: # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None - batch_ptr: Optional[torch.tensor] = None - token_chunk_offset_ptr: Optional[torch.tensor] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None class Mamba2AttentionMetadataBuilder( diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index f5ad65b02b4d4..717c40b37ecfb 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -34,8 +34,8 @@ class ShortConvAttentionMetadata: # For causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None - batch_ptr: Optional[torch.tensor] = None - token_chunk_offset_ptr: Optional[torch.tensor] = None + batch_ptr: Optional[torch.Tensor] = None + token_chunk_offset_ptr: Optional[torch.Tensor] = None class ShortConvAttentionMetadataBuilder( From 67244c86f0f1ffc06fcab9cad5e78989695cc15f Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:29:40 +0800 Subject: [PATCH 38/58] feat(api): Return 503 on /health when engine is dead (#24897) 
Signed-off-by: dongbo910220 <1275604947@qq.com> Co-authored-by: Claude --- vllm/entrypoints/openai/api_server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 93ea846f26f6c..912e664120929 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -103,6 +103,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, is_valid_ipv6_address, set_ulimit) +from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -351,8 +352,11 @@ def engine_client(request: Request) -> EngineClient: @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) + try: + await engine_client(raw_request).check_health() + return Response(status_code=200) + except EngineDeadError: + return Response(status_code=503) @router.get("/load") From 5f696c33b1fbf33fe91ecdd958874b9dd52f79b4 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 18 Sep 2025 23:22:01 +0800 Subject: [PATCH 39/58] [New Model] Support BertForTokenClassification / Named Entity Recognition (NER) task (#24872) Signed-off-by: wang.yuqi Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- docs/models/supported_models.md | 11 +++ examples/offline_inference/pooling/README.md | 8 ++- examples/offline_inference/pooling/ner.py | 54 ++++++++++++++ examples/online_serving/pooling/README.md | 6 ++ examples/online_serving/pooling/ner.py | 71 +++++++++++++++++++ .../pooling/test_token_classification.py | 39 ++++++++++ tests/models/registry.py | 1 + vllm/entrypoints/llm.py | 4 ++ vllm/model_executor/models/bert.py 
| 52 ++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/v1/attention/backends/flex_attention.py | 12 +++- 11 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 examples/offline_inference/pooling/ner.py create mode 100644 examples/online_serving/pooling/ner.py create mode 100644 tests/models/language/pooling/test_token_classification.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7aeaeca97699c..b67ebcbe3c81a 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -554,6 +554,17 @@ If your model is not in the above list, we will try to automatically convert the For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +#### Token Classification + +These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) API. + +| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) | +|--------------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------| +| `BertForTokenClassification` | bert-based | `boltuix/NeuroBERT-NER` (see note), etc. | | | ✅︎ | + +!!! note + Named Entity Recognition (NER) usage, please refer to , . 
+ [](){ #supported-mm-models } ## List of Multimodal Language Models diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index 8693f5e08e0ba..79afbd9cfac47 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -26,8 +26,14 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py python examples/offline_inference/pooling/embed_matryoshka_fy.py ``` +## Named Entity Recognition (NER) usage + +```bash +python examples/offline_inference/pooling/ner.py +``` + ## Qwen3 reranker usage ```bash -python qwen3_reranker.py +python examples/offline_inference/pooling/qwen3_reranker.py ``` diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py new file mode 100644 index 0000000000000..f18742fac0d54 --- /dev/null +++ b/examples/offline_inference/pooling/ner.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER + +from argparse import Namespace + +from vllm import LLM, EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="boltuix/NeuroBERT-NER", + runner="pooling", + enforce_eager=True, + trust_remote_code=True, + ) + return parser.parse_args() + + +def main(args: Namespace): + # Sample prompts. + prompts = [ + "Barack Obama visited Microsoft headquarters in Seattle on January 2025." + ] + + # Create an LLM. 
+ llm = LLM(**vars(args)) + tokenizer = llm.get_tokenizer() + label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label + + # Run inference + outputs = llm.encode(prompts) + + for prompt, output in zip(prompts, outputs): + logits = output.outputs.data + predictions = logits.argmax(dim=-1) + + # Map predictions to labels + tokens = tokenizer.convert_ids_to_tokens(output.prompt_token_ids) + labels = [label_map[p.item()] for p in predictions] + + # Print results + for token, label in zip(tokens, labels): + if token not in tokenizer.all_special_tokens: + print(f"{token:15} → {label}") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index f7926542202d6..2c271b6a32bc2 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -12,6 +12,12 @@ python examples/online_serving/pooling/cohere_rerank_client.py python examples/online_serving/pooling/jinaai_rerank_client.py ``` +## Named Entity Recognition (NER) usage + +```bash +python examples/online_serving/pooling/ner.py +``` + ## Openai chat embedding for multimodal usage ```bash diff --git a/examples/online_serving/pooling/ner.py b/examples/online_serving/pooling/ner.py new file mode 100644 index 0000000000000..9ec2bd45a0fe5 --- /dev/null +++ b/examples/online_serving/pooling/ner.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://huggingface.co/boltuix/NeuroBERT-NER + +""" +Example online usage of Pooling API for Named Entity Recognition (NER). + +Run `vllm serve --runner pooling` +to start up the server in vLLM. e.g. 
+ +vllm serve boltuix/NeuroBERT-NER +""" + +import argparse + +import requests +import torch + + +def post_http_request(prompt: dict, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="boltuix/NeuroBERT-NER") + + return parser.parse_args() + + +def main(args): + from transformers import AutoConfig, AutoTokenizer + + api_url = f"http://{args.host}:{args.port}/pooling" + model_name = args.model + + # Load tokenizer and config + tokenizer = AutoTokenizer.from_pretrained(model_name) + config = AutoConfig.from_pretrained(model_name) + label_map = config.id2label + + # Input text + text = "Barack Obama visited Microsoft headquarters in Seattle on January 2025." 
+ prompt = {"model": model_name, "input": text} + + pooling_response = post_http_request(prompt=prompt, api_url=api_url) + + # Run inference + output = pooling_response.json()["data"][0] + logits = torch.tensor(output["data"]) + predictions = logits.argmax(dim=-1) + inputs = tokenizer(text, return_tensors="pt") + + # Map predictions to labels + tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]) + labels = [label_map[p.item()] for p in predictions] + assert len(tokens) == len(predictions) + + # Print results + for token, label in zip(tokens, labels): + if token not in tokenizer.all_special_tokens: + print(f"{token:15} → {label}") + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py new file mode 100644 index 0000000000000..fd5e48a8b1449 --- /dev/null +++ b/tests/models/language/pooling/test_token_classification.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch +from transformers import AutoModelForTokenClassification + +from tests.models.utils import softmax + + +@pytest.mark.parametrize("model", ["boltuix/NeuroBERT-NER"]) +# The float32 is required for this tiny model to pass the test. 
+@pytest.mark.parametrize("dtype", ["float"]) +@torch.inference_mode +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) + + with hf_runner(model, + dtype=dtype, + auto_cls=AutoModelForTokenClassification) as hf_model: + tokenizer = hf_model.tokenizer + hf_outputs = [] + for prompt in example_prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = hf_model.wrap_device(inputs) + output = hf_model.model(**inputs) + hf_outputs.append(softmax(output.logits[0])) + + # check logits difference + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + hf_output = torch.tensor(hf_output).cpu().float() + vllm_output = torch.tensor(vllm_output).cpu().float() + assert torch.allclose(hf_output, vllm_output, 1e-2) diff --git a/tests/models/registry.py b/tests/models/registry.py index 93aa9d4025498..e9cc5170ade74 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -414,6 +414,7 @@ _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { # [Cross-encoder] "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501 + "BertForTokenClassification": _HfExamplesInfo("boltuix/NeuroBERT-NER"), "GteNewForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-multilingual-reranker-base", # noqa: E501 trust_remote_code=True, hf_overrides={ diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 63e9478612bb1..df6b16c73d6e7 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -943,6 +943,10 @@ class LLM: considered legacy and may be deprecated in the future. You should instead pass them via the `inputs` parameter. 
""" + + if self.supported_tasks == ["encode"] and pooling_task is None: + pooling_task = "encode" + if pooling_task is None: if "embed" in self.supported_tasks: pooling_task = "embed" diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index c07e5364814ac..ee32587f6b1b4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -611,3 +611,55 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, positions=positions, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) + + +@default_pooling_type("ALL") +class BertForTokenClassification(nn.Module): + is_pooling_model = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + self.head_dtype = vllm_config.model_config.head_dtype + self.num_labels = config.num_labels + self.bert = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "bert"), + embedding_class=BertEmbedding) + self.classifier = nn.Linear(config.hidden_size, + config.num_labels, + dtype=self.head_dtype) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + }) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loaded_params = loader.load_weights(weights) + return loaded_params + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + if token_type_ids is not None: + assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT) + assert input_ids is not None + _encode_token_type_ids(input_ids, token_type_ids) + + hidden_states = self.bert(input_ids=input_ids, + 
positions=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) + + hidden_states = hidden_states.to(self.head_dtype) + return self.classifier(hidden_states) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 707b57106e6d9..1382fd9e93ea3 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -193,6 +193,7 @@ _EMBEDDING_MODELS = { _CROSS_ENCODER_MODELS = { "BertForSequenceClassification": ("bert", "BertForSequenceClassification"), + "BertForTokenClassification": ("bert", "BertForTokenClassification"), "GteNewForSequenceClassification": ("bert_with_rope", "GteNewForSequenceClassification"), "ModernBertForSequenceClassification": ("modernbert", diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index cb983494216a7..662d3984554ad 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -720,6 +720,15 @@ class FlexAttentionImpl(AttentionImpl): (query, key, value), ) + query = query[:, :, :num_actual_tokens, :] + if ((key_tensor.size(-2) > num_actual_tokens) + or (value_tensor.size(-2) > num_actual_tokens)): + # In the encoder-only model with torch.compile, + # qkv might be padded, which might cause exception. 
+ # see: https://github.com/vllm-project/vllm/pull/24872#discussion_r2353252290 + key_tensor = key_tensor[:, :, :num_actual_tokens, :] + value_tensor = value_tensor[:, :, :num_actual_tokens, :] + else: assert self.attn_type == AttentionType.DECODER key_cache, value_cache = kv_cache.unbind(0) @@ -744,7 +753,8 @@ class FlexAttentionImpl(AttentionImpl): (query, key_cache, value_cache), ) - query = query[:, :, :num_actual_tokens, :] + query = query[:, :, :num_actual_tokens, :] + # Doesn't work for now -> constraint violation # torch._dynamo.try_mark_dynamic(query, 2) From b419937c78017dc4c5bfa19f11547f4832ea2290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Fri, 19 Sep 2025 00:23:26 +0900 Subject: [PATCH 40/58] [Docs] Fix warnings in mkdocs build (continued) (#25163) Signed-off-by: Zerohertz --- .../device_communicators/shm_object_storage.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py index 352e7525d4c84..0310fc14da256 100644 --- a/vllm/distributed/device_communicators/shm_object_storage.py +++ b/vllm/distributed/device_communicators/shm_object_storage.py @@ -253,7 +253,7 @@ class SingleWriterShmRingBuffer: Args: nbytes (int, optional): The size of the buffer to free. If None, - frees the maximum size of the ring buffer. + frees the maximum size of the ring buffer. ''' assert self.is_writer, "Only the writer can free buffers." 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index d391cc50ad232..4eb1f8b89d64f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -697,9 +697,7 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> TextTokensPrompt: """ - A simpler implementation of - [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] - that assumes single input. + A simpler implementation that tokenizes a single prompt input. """ async for result in self._tokenize_prompt_inputs_async( request, @@ -718,9 +716,7 @@ class OpenAIServing: add_special_tokens: bool = True, ) -> AsyncGenerator[TextTokensPrompt, None]: """ - A simpler implementation of - [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs] - that assumes multiple inputs. + A simpler implementation that tokenizes multiple prompt inputs. """ for prompt in prompt_inputs: if isinstance(prompt, str): From 2ea50e977aac00c63e78990a7477bb91295df183 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Thu, 18 Sep 2025 10:52:58 -0500 Subject: [PATCH 41/58] Enable Allgather/ReduceScatter backend for NaiveAllToAll (#23964) Signed-off-by: Shu Wang. 
Signed-off-by: Tyler Michael Smith Signed-off-by: Shu Wang Co-authored-by: Tyler Michael Smith Co-authored-by: Tyler Michael Smith Co-authored-by: Michael Goin --- .../device_communicators/all2all.py | 39 +++++++++++++++++++ .../device_communicators/cuda_communicator.py | 4 ++ vllm/envs.py | 17 +++++--- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 427fd040fcb71..149df73d8667b 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -5,6 +5,7 @@ from typing import Any import torch import torch.distributed as dist +from vllm.distributed import get_dp_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.utils import has_deep_ep, has_pplx @@ -69,6 +70,44 @@ class NaiveAll2AllManager(All2AllManagerBase): pass +class AgRsAll2AllManager(All2AllManagerBase): + """ + An implementation of all2all communication based on + all-gather (dispatch) and reduce-scatter (combine). + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + """ + Gather hidden_states and router_logits from all dp ranks. + """ + sizes = get_forward_context( + ).dp_metadata.get_chunk_sizes_across_dp_rank() + hidden_states, router_logits = get_dp_group().all_gatherv( + [hidden_states, router_logits], + dim=0, + sizes=sizes, + ) + return hidden_states, router_logits + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Reduce-scatter hidden_states across all dp ranks. 
+ """ + sizes = get_forward_context( + ).dp_metadata.get_chunk_sizes_across_dp_rank() + hidden_states = get_dp_group().reduce_scatterv(hidden_states, + dim=0, + sizes=sizes) + return hidden_states + + def destroy(self): + pass + + class PPLXAll2AllManager(All2AllManagerBase): """ All2All communication based on PPLX kernels. diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 78c90b006ffc8..b2bf3bc3cc2ed 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -87,6 +87,10 @@ class CudaCommunicator(DeviceCommunicatorBase): from .all2all import NaiveAll2AllManager self.all2all_manager = NaiveAll2AllManager(self.cpu_group) logger.info("Using naive all2all manager.") + elif all2all_backend == "allgather_reducescatter": + from .all2all import AgRsAll2AllManager + self.all2all_manager = AgRsAll2AllManager(self.cpu_group) + logger.info("Using AllGather-ReduceScatter all2all manager.") elif all2all_backend == "pplx": from .all2all import PPLXAll2AllManager self.all2all_manager = PPLXAll2AllManager(self.cpu_group) diff --git a/vllm/envs.py b/vllm/envs.py index 72e1d5b0ede81..19e2f8635275d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -149,8 +149,11 @@ if TYPE_CHECKING: VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost" VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 - VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", "deepep_high_throughput", - "deepep_low_latency"] = "naive" + VLLM_ALL2ALL_BACKEND: Literal["naive", "pplx", + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter"] = \ + "allgather_reducescatter" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 VLLM_SLEEP_WHEN_IDLE: bool = False @@ -1124,14 +1127,18 @@ environment_variables: dict[str, Callable[[], Any]] = { # all2all backend for vllm's 
expert parallel communication # Available options: - # - "naive": naive all2all implementation using all-reduce + # - "naive": naive all2all implementation using broadcasts + # - "allgather_reducescatter": all2all implementation based on allgather and + # reducescatter # - "pplx": use pplx kernels # - "deepep_high_throughput", use deepep high-throughput kernels # - "deepep_low_latency", use deepep low-latency kernels "VLLM_ALL2ALL_BACKEND": - env_with_choices("VLLM_ALL2ALL_BACKEND", "naive", + env_with_choices("VLLM_ALL2ALL_BACKEND", "allgather_reducescatter", ["naive", "pplx", - "deepep_high_throughput", "deepep_low_latency"]), + "deepep_high_throughput", + "deepep_low_latency", + "allgather_reducescatter"]), # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support. # Both require compute capability 10.0 or above. From 1c3b1634aa9d4be56fa6e931e96ec8145fedcc0a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:01:50 +0100 Subject: [PATCH 42/58] [Misc] Add codeowner for Transformers backend (#25180) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/CODEOWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b8d6db06548d5..08717cdde643a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -61,6 +61,10 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC +# Transformers backend +/vllm/model_executor/models/transformers.py @hmellor +/tests/models/test_transformers.py @hmellor + # Docs /docs @hmellor mkdocs.yaml @hmellor From c4cb0af98a8e39950fa9b99acf7c241959a14ac8 Mon Sep 17 00:00:00 2001 From: qizixi <22851944+zixi-qi@users.noreply.github.com> Date: Thu, 18 Sep 2025 09:12:19 -0700 Subject: [PATCH 43/58] [spec decode] Fix MTP inference path for MiMo-7B model (#25136) Signed-off-by: zixi-qi Co-authored-by: Cyrus Leung --- 
examples/offline_inference/spec_decode.py | 6 +++++- vllm/config/speculative.py | 2 +- vllm/model_executor/models/mimo_mtp.py | 18 ++++++++++++++---- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 5af232cb6af6a..004e75b204642 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -53,7 +53,6 @@ def parse_args(): "--method", type=str, default="eagle", - choices=["ngram", "eagle", "eagle3", "mtp"], ) parser.add_argument("--num-spec-tokens", type=int, default=2) parser.add_argument("--prompt-lookup-max", type=int, default=5) @@ -118,6 +117,11 @@ def main(): "prompt_lookup_max": args.prompt_lookup_max, "prompt_lookup_min": args.prompt_lookup_min, } + elif args.method.endswith("mtp"): + speculative_config = { + "method": args.method, + "num_speculative_tokens": args.num_spec_tokens, + } else: raise ValueError(f"unknown method: {args.method}") diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index fca8c28e5c61e..2c861723c3966 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -31,7 +31,7 @@ logger = init_logger(__name__) SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp", - "ernie_mtp", "qwen3_next_mtp"] + "ernie_mtp", "qwen3_next_mtp", "mimo_mtp"] @config diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index ac835edc001ea..09194e9f95d0e 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -241,6 +241,15 @@ class MiMoMTP(nn.Module): def map_model_name_to_mtp_param_name(self, name: str) -> str: import regex as re + + # append mtp_start_layer_idx + pattern = r"(model\.mtp_layers\.)(\d+)(\.)" + match = re.match(pattern, name) + if match: + original_num = int(match.group(2)) + new_num = original_num + 
self.config.num_hidden_layers + name = name.replace(match.group(), f"{match.group(1)}{new_num}.") + # check for early turn name_without_prefix = [ "token_layernorm", "hidden_layernorm", "input_proj", "final_layernorm" @@ -248,10 +257,11 @@ class MiMoMTP(nn.Module): for sub_name in name_without_prefix: if sub_name in name: return name - pattern = r"model.mtp_layers.(\d+)." - group = re.match(pattern, name) - if group is not None: - name = name.replace(group.group(), group.group() + "mtp_block.") + # add mtp_block + pattern = r"(model\.mtp_layers\.\d+\.)" + match = re.match(pattern, name) + if match: + name = name.replace(match.group(), match.group() + "mtp_block.") return name def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str: From dc3405936090f5c964a5b38c9de8c8400f01541c Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:36:55 -0400 Subject: [PATCH 44/58] [ROCm][CI/Build] Use ROCm7.0 as the base (#25178) Signed-off-by: Gregory Shtrasberg --- docker/Dockerfile.rocm | 5 ++- docker/Dockerfile.rocm_base | 61 ++++++++----------------------------- 2 files changed, 16 insertions(+), 50 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 063fc49693288..c8900212e5a1b 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -29,7 +29,10 @@ ARG VLLM_BRANCH="main" ONBUILD RUN git clone ${VLLM_REPO} \ && cd vllm \ && git fetch -v --prune -- origin ${VLLM_BRANCH} \ - && git checkout FETCH_HEAD + && git checkout FETCH_HEAD \ + && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \ + git remote add upstream "https://github.com/vllm-project/vllm.git" \ + && git fetch upstream ; fi FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 2ba5461dfe551..4973b57f76563 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ 
-1,25 +1,23 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete -ARG HIPBLASLT_BRANCH="aa0bda7b" -ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" -ARG LEGACY_HIPBLASLT_OPTION= -ARG TRITON_BRANCH="e5be006" -ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="f717b2af" -ARG PYTORCH_VISION_BRANCH="v0.21.0" +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete +ARG TRITON_BRANCH="f9e5bf54" +ARG TRITON_REPO="https://github.com/ROCm/triton.git" +ARG PYTORCH_BRANCH="b2fb6885" +ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" -ARG FA_BRANCH="1a7f4dfa" +ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="4822e675" +ARG AITER_BRANCH="2ab9f4cd" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base -ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV ROCM_PATH=/opt/rocm ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: -ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201 +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV AITER_ROCM_ARCH=gfx942;gfx950 ARG PYTHON_VERSION=3.12 @@ -45,29 +43,6 @@ RUN apt-get update -y \ RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython -FROM base AS build_hipblaslt -ARG HIPBLASLT_BRANCH -ARG HIPBLAS_COMMON_BRANCH -# Set to "--legacy_hipblas_direct" for ROCm<=6.2 -ARG LEGACY_HIPBLASLT_OPTION -RUN git clone https://github.com/ROCm/hipBLAS-common.git -RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y -RUN cd hipBLAS-common \ - && git checkout ${HIPBLAS_COMMON_BRANCH} \ - && mkdir build \ - && cd build \ - && cmake .. 
\ - && make package \ - && dpkg -i ./*.deb -RUN git clone https://github.com/ROCm/hipBLASLt -RUN cd hipBLASLt \ - && git checkout ${HIPBLASLT_BRANCH} \ - && apt-get install -y llvm-dev \ - && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ - && cd build/release \ - && make package -RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install - FROM base AS build_triton ARG TRITON_BRANCH ARG TRITON_REPO @@ -121,13 +96,11 @@ RUN cd aiter \ && git checkout ${AITER_BRANCH} \ && git submodule update --init --recursive \ && pip install -r requirements.txt -RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install FROM base AS debs RUN mkdir /app/debs -RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ - cp /install/*.deb /app/debs RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ cp /install/*.whl /app/debs RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -138,11 +111,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ cp /install/*.whl /app/debs FROM base AS final -RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ - dpkg -i /install/*deb \ - && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ - && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ - && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ pip install /install/*.whl RUN 
--mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -153,9 +121,6 @@ RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ pip install /install/*.whl ARG BASE_IMAGE -ARG HIPBLAS_COMMON_BRANCH -ARG HIPBLASLT_BRANCH -ARG LEGACY_HIPBLASLT_OPTION ARG TRITON_BRANCH ARG TRITON_REPO ARG PYTORCH_BRANCH @@ -167,9 +132,6 @@ ARG FA_REPO ARG AITER_BRANCH ARG AITER_REPO RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ - && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ - && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ - && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ @@ -177,5 +139,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file From bbdc0f2366997536207abc212fcdae7a1b688159 Mon Sep 17 00:00:00 2001 From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:46:47 -0500 Subject: [PATCH 45/58] [ROCm][AITER][Bugfix] Switch AITER to use PIECEWISE_AND_FULL compilation (#25104) Signed-off-by: Rohan138 --- vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 8eb3505cf274d..afb2283c44d37 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ 
b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -232,7 +232,7 @@ class AiterFlashAttentionMetadata: class AiterFlashAttentionMetadataBuilder( AttentionMetadataBuilder[AiterFlashAttentionMetadata]): - cudagraph_support = AttentionCGSupport.ALWAYS + cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): From 505805b645649be6a8e788a1f048b851fa123ef1 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 18 Sep 2025 20:57:07 +0300 Subject: [PATCH 46/58] [KV offload][1/N] Introduce an offloading component (#19848) Signed-off-by: Or Ozeri --- .buildkite/test-pipeline.yaml | 1 + tests/v1/offloading/test_worker.py | 152 +++++++++++++++++++++++++ vllm/v1/offloading/abstract.py | 165 ++++++++++++++++++++++++++++ vllm/v1/offloading/mediums.py | 39 +++++++ vllm/v1/offloading/worker/worker.py | 142 ++++++++++++++++++++++++ 5 files changed, 499 insertions(+) create mode 100644 tests/v1/offloading/test_worker.py create mode 100644 vllm/v1/offloading/abstract.py create mode 100644 vllm/v1/offloading/mediums.py create mode 100644 vllm/v1/offloading/worker/worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 66dfc990805f2..5fd08296625ad 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -280,6 +280,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/executor + - pytest -v -s v1/offloading - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/offloading/test_worker.py new file mode 100644 index 0000000000000..2391b565773aa --- /dev/null +++ b/tests/v1/offloading/test_worker.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.v1.offloading.abstract import LoadStoreSpec 
+from vllm.v1.offloading.worker.worker import (OffloadingHandler, + OffloadingWorker, TransferResult, + TransferSpec) + + +class LoadStoreSpec1(LoadStoreSpec): + + def __init__(self, + submit_success: bool = True, + async_success: bool = True, + exception: bool = False): + self.finished = False + self.submit_success = submit_success + self.async_success = async_success + self.exception = exception + + @staticmethod + def medium() -> str: + return "1" + + def __repr__(self): + return f"{self.medium()}: {id(self)}" + + +class LoadStoreSpec2(LoadStoreSpec): + + @staticmethod + def medium() -> str: + return "2" + + def __repr__(self): + return f"{self.medium()}: {id(self)}" + + +class OffloadingHandler1To2(OffloadingHandler): + + def __init__(self): + self.transfers: dict[int, LoadStoreSpec1] = {} + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + src, dst = spec + assert isinstance(src, LoadStoreSpec1) + assert isinstance(dst, LoadStoreSpec2) + + if src.exception: + raise Exception("An expected exception. 
Don't worry!") + if not src.submit_success: + return False + + self.transfers[job_id] = src + return True + + def get_finished(self) -> list[TransferResult]: + finished = [] + for job_id, spec in list(self.transfers.items()): + if spec.finished: + finished.append((job_id, spec.async_success)) + del self.transfers[job_id] + return finished + + +class OffloadingHandler2To1(OffloadingHandler): + + def __init__(self): + self.transfers: dict[int, LoadStoreSpec1] = {} + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + src, dst = spec + assert isinstance(src, LoadStoreSpec2) + assert isinstance(dst, LoadStoreSpec1) + + self.transfers[job_id] = dst + return True + + def get_finished(self) -> list[TransferResult]: + finished = [] + for job_id, spec in list(self.transfers.items()): + if spec.finished: + finished.append((job_id, spec.async_success)) + del self.transfers[job_id] + return finished + + +def test_offloading_worker(): + """ + Tests OffloadingWorker with 2 handlers. + One handler performs 1->2 transfers, and the other handles 2->1. 
+ """ + worker = OffloadingWorker() + handler1to2 = OffloadingHandler1To2() + handler2to1 = OffloadingHandler2To1() + worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2) + worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1) + + # 1st transfer 1->2 (exception) + src1 = LoadStoreSpec1(exception=True) + dst1 = LoadStoreSpec2() + assert not worker.transfer_async(1, (src1, dst1)) + + # 2nd transfer 1->2 (failure to submit) + src2 = LoadStoreSpec1(submit_success=False) + dst2 = LoadStoreSpec2() + assert not worker.transfer_async(2, (src2, dst2)) + + # 3rd transfer 1->2 (failure) + src3 = LoadStoreSpec1(async_success=False) + dst3 = LoadStoreSpec2() + assert worker.transfer_async(3, (src3, dst3)) + + # 4th transfer 1->2 (success) + src4 = LoadStoreSpec1() + dst4 = LoadStoreSpec2() + worker.transfer_async(4, (src4, dst4)) + assert set(handler1to2.transfers.keys()) == {3, 4} + + # 5th transfer 2->1 + src5 = LoadStoreSpec2() + dst5 = LoadStoreSpec1() + worker.transfer_async(5, (src5, dst5)) + assert set(handler2to1.transfers.keys()) == {5} + + # no transfer completed yet + assert worker.get_finished() == [] + + # complete 3rd, 4th + src3.finished = True + src4.finished = True + + # 6th transfer 1->2 + src6 = LoadStoreSpec1() + dst6 = LoadStoreSpec2() + worker.transfer_async(6, (src6, dst6)) + + # 7th transfer 2->1 + src7 = LoadStoreSpec2() + dst7 = LoadStoreSpec1() + worker.transfer_async(7, (src7, dst7)) + + # 6th and 7th transfers started + assert 6 in handler1to2.transfers + assert 7 in handler2to1.transfers + + # verify result of 3rd and 4th transfers + assert (sorted(worker.get_finished()) == [(3, False), (4, True)]) + + # complete 6th and 7th transfers + src6.finished = True + dst7.finished = True + assert (sorted(worker.get_finished()) == [(6, True), (7, True)]) diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/offloading/abstract.py new file mode 100644 index 0000000000000..9f9c044ea1c53 --- /dev/null +++
b/vllm/v1/offloading/abstract.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +OffloadingManager class for managing KV data offloading in vLLM v1 + +This class runs in the scheduler, tracks which blocks are offloaded +and their address. + +The class provides the following primitives: + lookup() - find the length of the maximal series of blocks, + starting from the first one, that are all offloaded. + prepare_load() - prepare given blocks to be read. + The given blocks will be protected from eviction. + This function returns a LoadStoreSpec which encapsulates + information required for performing the load. + touch() - marks the given blocks as recently used. Can be used + to track block's LRU. This function is separated from the + prepare_load function to allow setting block recency even + for blocks which do not need reading from the cache, such as + blocks that are cached by the GPU prefix cache. + complete_load() - mark blocks which were previously prepared to be + loaded as done loading. This is to re-allow their eviction. + prepare_store() - prepare the given blocks to be written. + Returns a LoadStoreSpec encapsulating offloading information, + as well as a list of blocks that were evicted as a result. + complete_store() - marks a previous store as completed. + Following this call, the given blocks will become loadable. +""" + +from abc import ABC, abstractmethod +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Optional + +from vllm.v1.core.kv_cache_utils import BlockHash + + +class LoadStoreSpec(ABC): + """ + Abstract metadata that encapsulates information allowing a worker + to load, and optionally also to store, blocks of KV data. + """ + + @staticmethod + @abstractmethod + def medium() -> str: + """ + Returns a string representation of the medium type + this store/load targets.
+ """ + pass + + +@dataclass +class PrepareStoreOutput: + block_hashes_to_store: list[BlockHash] + store_spec: LoadStoreSpec + block_hashes_evicted: list[BlockHash] + + +@dataclass +class OffloadingEvent: + block_hashes: list[BlockHash] + block_size: int + medium: str + # True if blocks are removed, False if stored + removed: bool + + +class OffloadingManager(ABC): + + @abstractmethod + def lookup(self, block_hashes: Iterable[BlockHash]) -> int: + """ + Finds the length of the maximal series of blocks, starting from the + first one, that are all offloaded. + + Args: + block_hashes: the hashes identifying the blocks to lookup. + + Returns: + An integer representing the maximal number of blocks that + are currently offloaded. + """ + pass + + @abstractmethod + def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec: + """ + Prepare the given blocks to be read. + The given blocks will be protected from eviction until + complete_load is called. + It assumes all given blocks are offloaded. + + Args: + block_hashes: the hashes identifying the blocks. + + Returns: + A LoadStoreSpec that can be used by a worker to locate and load + the actual offloaded KV data. + """ + pass + + def touch(self, block_hashes: Iterable[BlockHash]): + """ + Mark the given blocks as recently used. + This could in practice mean moving them to the end of an LRU list. + + Args: + block_hashes: the hashes identifying the blocks. + """ + return + + def complete_load(self, block_hashes: Iterable[BlockHash]): + """ + Marks previous blocks that were prepared to load as done loading. + + Args: + block_hashes: the hashes identifying the blocks. + """ + return + + @abstractmethod + def prepare_store( + self, + block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]: + """ + Prepare the given blocks to be offloaded. + The given blocks will be protected from eviction until + complete_store is called. + + Args: + block_hashes: the hashes identifying the blocks. 
+ + Returns: + A PrepareStoreOutput indicating which blocks need storing, + where to store them (LoadStoreSpec), and list of blocks that + were evicted as a result. + None is returned if the blocks cannot be stored. + """ + pass + + def complete_store(self, + block_hashes: Iterable[BlockHash], + success: bool = True): + """ + Marks blocks which were previously prepared to be stored, as stored. + Following this call, the blocks become loadable. + If success is False, blocks that were not marked as stored will be + removed. + + Args: + block_hashes: the hashes identifying the blocks. + success: whether the blocks were stored successfully. + """ + return + + def take_events(self) -> Iterable[OffloadingEvent]: + """ + Take the offloading events from the manager. + + Yields: + New OffloadingEvents collected since the last call. + """ + return () diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/offloading/mediums.py new file mode 100644 index 0000000000000..5a1887848c9fc --- /dev/null +++ b/vllm/v1/offloading/mediums.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC + +import numpy as np + +from vllm.v1.offloading.abstract import LoadStoreSpec + + +class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC): + """ + Spec for loading/storing KV blocks from given block numbers. + """ + + def __init__(self, block_ids: list[int]): + self.block_ids = np.array(block_ids, dtype=np.int64) + + def __repr__(self) -> str: + return repr(self.block_ids) + + +class GPULoadStoreSpec(BlockIDsLoadStoreSpec): + """ + Spec for loading/storing a KV block to GPU memory. + """ + + @staticmethod + def medium() -> str: + return "GPU" + + +class CPULoadStoreSpec(BlockIDsLoadStoreSpec): + """ + Spec for loading/storing a KV block to CPU memory.
+ """ + + @staticmethod + def medium() -> str: + return "CPU" diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/offloading/worker/worker.py new file mode 100644 index 0000000000000..d2c2045d1f1f6 --- /dev/null +++ b/vllm/v1/offloading/worker/worker.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod + +from vllm.logger import init_logger +from vllm.v1.offloading.abstract import LoadStoreSpec + +# a single transfer spec (src_blocks_spec, dst_blocks_spec) +TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec] +# transfers are forwarded to workers by (src_medium, dst_medium) +TransferType = tuple[str, str] +# transfer result (job_id, success) +TransferResult = tuple[int, bool] + +logger = init_logger(__name__) + + +class OffloadingHandler(ABC): + """ + OffloadingHandler class for managing asynchronous KV data transfers + + This class runs in the worker. + It kicks off async KV data transfer requests, and allows + collecting back completion statuses. + + The class provides the following primitives: + transfer_async() - kicks off a new transfer job + get_finished() - returns a list of newly finished job IDs. + """ + + @abstractmethod + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + """ + Initiates an asynchronous transfer of KV data. + + Args: + job_id: a unique ID that will be used when notifying back on + transfer completion. + spec: the (src, dst) spec of the KV data transfer. + + Returns: + True if transfer was submitted successfully. + """ + pass + + @abstractmethod + def get_finished(self) -> list[TransferResult]: + """ + Get transfers finished since last call. + + Returns: + A list of (job_id, success) of transfers. + """ + pass + + +class OffloadingWorker: + """ + OffloadingWorker class for managing asynchronous KV data transfers + using multiple OffloadingHandlers + + This class runs in the worker. 
+ It kicks off async KV data transfer requests, by delegating + to one of its registered OffloadingHandlers, based on the transfer type. + + The class provides the following primitives: + register_handler() - registers a new handler to handle + a specific transfer type + transfer_async() - kicks off a new transfer job + using one of the registered handlers. + get_finished() - returns a list of newly finished job IDs + from all handlers. + """ + + def __init__(self): + self.handlers: set[OffloadingHandler] = set() + self.transfer_type_to_handler: dict[TransferType, + OffloadingHandler] = {} + + def register_handler(self, src_cls: type[LoadStoreSpec], + dst_cls: type[LoadStoreSpec], + handler: OffloadingHandler) -> None: + """ + Registers a new handler. + + Args: + src_cls: the source type of transfers handled by this handler. + dst_cls: the destination type of transfers handled by this handler. + handler: the handler that will handle transfers. + """ + transfer_type = (src_cls.medium(), dst_cls.medium()) + assert transfer_type not in self.transfer_type_to_handler + self.handlers.add(handler) + self.transfer_type_to_handler[transfer_type] = handler + + def transfer_async(self, job_id: int, spec: TransferSpec) -> bool: + """ + Initiates an asynchronous transfer of KV data. + + Args: + job_id: a unique ID that will be used when notifying back on + transfer completion. + spec: the (src, dst) spec of the KV data transfer. + + Returns: + True if transfer was submitted successfully. 
+ """ + src, dst = spec + transfer_type = (src.medium(), dst.medium()) + handler = self.transfer_type_to_handler.get(transfer_type) + assert handler is not None + + try: + success = handler.transfer_async(job_id, spec) + except Exception as e: + logger.warning("Exception in %r transfer %d: %r", + transfer_type, + job_id, + e, + exc_info=True) + return False + + if not success: + logger.warning("Failed to submit %r transfer %d", transfer_type, + job_id) + else: + logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id, + spec) + + return success + + def get_finished(self) -> list[TransferResult]: + """ + Get transfers finished since last call. + + Returns: + A list of (job_id, success) of transfers. + """ + finished = [] + for handler in self.handlers: + finished.extend(handler.get_finished()) + return finished From e19bce40a1660cb7c03b790d0b000db155cf925d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 11:07:42 -0700 Subject: [PATCH 47/58] [V0 Deprecation] Remove AsyncLLMEngine (#25025) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- tests/entrypoints/openai/test_chat.py | 54 +- tests/entrypoints/openai/test_completion.py | 830 ------------- .../test_completion_with_prompt_embeds.py | 3 + .../entrypoints/openai/test_lora_adapters.py | 5 +- tests/entrypoints/openai/test_metrics.py | 2 +- .../openai/test_return_tokens_as_ids.py | 26 +- .../entrypoints/openai/test_skip_tokenizer.py | 8 - tests/v1/test_oracle.py | 18 - vllm/engine/async_llm_engine.py | 1030 +---------------- vllm/entrypoints/launcher.py | 2 - vllm/entrypoints/openai/api_server.py | 65 +- 11 files changed, 76 insertions(+), 1967 deletions(-) delete mode 100644 tests/entrypoints/openai/test_completion.py diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index a827f94cfbfe5..3bdfef7b4adbc 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -28,11 +28,9 @@ def monkeypatch_module(): 
mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) -def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') +@pytest.fixture(scope="module") +def server(monkeypatch_module, zephyr_lora_files): #noqa: F811 + monkeypatch_module.setenv('VLLM_USE_V1', '1') args = [ # use half precision for speed and memory savings in CI environment @@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files): #noqa: F811 yield remote_server -@pytest.fixture -def is_v1_server(server): - import os - assert os.environ['VLLM_USE_V1'] in ['0', '1'] - return os.environ['VLLM_USE_V1'] == '1' - - @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: @@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio async def test_structured_outputs_choice_chat( - client: openai.AsyncOpenAI, sample_structured_outputs_choices, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") + client: openai.AsyncOpenAI, + sample_structured_outputs_choices, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat( @pytest.mark.asyncio -async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, - sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") - +async def test_structured_outputs_json_chat( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, - sample_regex, 
is_v1_server: bool): - if not is_v1_server: - pytest.skip("Structured outputs is only supported in v1 engine") +async def test_structured_outputs_regex_chat( + client: openai.AsyncOpenAI, + sample_regex, +): messages = [{ "role": "system", @@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs( @pytest.mark.asyncio -async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("Tool use is only supported in v1 engine") +async def test_named_tool_use( + client: openai.AsyncOpenAI, + sample_json_schema, +): messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_response_format_json_schema(client: openai.AsyncOpenAI, - is_v1_server: bool): - if not is_v1_server: - pytest.skip( - "JSON schema response format is only supported in v1 engine") +async def test_response_format_json_schema(client: openai.AsyncOpenAI): prompt = 'what is 1+1? 
The format is "result": 2' # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py deleted file mode 100644 index 0347513befe32..0000000000000 --- a/tests/entrypoints/openai/test_completion.py +++ /dev/null @@ -1,830 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for structured outputs tests -import json -import os -from typing import Optional - -import jsonschema -import openai # use the official client for correctness check -import pytest -import pytest_asyncio -import regex as re -import requests -# downloading lora to test lora requests -from openai import BadRequestError - -from vllm.transformers_utils.tokenizer import get_tokenizer - -from ...utils import RemoteOpenAIServer - -# any model with a chat template should work here -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically these adapters use a different base model, -# but we're not testing generation quality here - - -@pytest.fixture(scope="module") -def default_server_args(zephyr_lora_files): - return [ - # use half precision for speed and memory savings in CI environment - "--dtype", - "bfloat16", - "--max-model-len", - "8192", - "--max-num-seqs", - "128", - "--enforce-eager", - # lora config - "--enable-lora", - "--lora-modules", - f"zephyr-lora={zephyr_lora_files}", - "--max-lora-rank", - "64", - "--max-cpu-loras", - "2", - ] - - -@pytest.fixture(scope="module", - params=["", "--disable-frontend-multiprocessing"]) -def server(default_server_args, request): - if request.param: - default_server_args.append(request.param) - - original_value = os.environ.get('VLLM_USE_V1') - os.environ['VLLM_USE_V1'] = '0' - try: - with RemoteOpenAIServer(MODEL_NAME, - default_server_args) as remote_server: - yield remote_server - finally: - # Restore original env value - if original_value is None: 
- os.environ.pop('VLLM_USE_V1', None) - else: - os.environ['VLLM_USE_V1'] = original_value - - -@pytest.fixture -def is_v1_server(server): - import os - - # For completion tests, we assume v0 since there's no explicit v1 setup - return os.environ.get('VLLM_USE_V1', '0') == '1' - - -@pytest_asyncio.fixture -async def client(server): - async with server.get_async_client() as async_client: - yield async_client - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): - completion = await client.completions.create(model=model_name, - prompt="Hello, my name is", - max_tokens=5, - temperature=0.0) - - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == "length" - assert completion.usage == openai.types.CompletionUsage( - completion_tokens=5, prompt_tokens=6, total_tokens=11) - - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): - # test using token IDs - with pytest.raises(openai.BadRequestError, match="out of vocabulary"): - # Added tokens should be rejected by the base model - await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token 
IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=None, - ) - choice = completion.choices[0] - assert choice.logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # just test 1 lora - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=0, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs[0]) == 1 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): - # test using token IDs - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - logprobs=5, - ) - choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is not None - assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, - model_name: str): - - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=21, - ) - ... 
- with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - stream = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - # vLLM has higher default max_logprobs (20 instead of 5) to support - # both Completion API and Chat Completion API - logprobs=30, - stream=True, - ) - async for chunk in stream: - ... - - # the server should still work afterwards - completion = await client.completions.create( - model=model_name, - prompt=[0, 0, 0, 0, 0], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), - (MODEL_NAME, 0), - (MODEL_NAME, 1), - (MODEL_NAME, None)]) -async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, - model_name: str, - prompt_logprobs: Optional[int]): - params: dict = { - "prompt": ["A robot may not injure another robot", "My name is"], - "model": model_name, - } - if prompt_logprobs is not None: - params["extra_body"] = {"prompt_logprobs": prompt_logprobs} - - if prompt_logprobs is not None and prompt_logprobs < 0: - with pytest.raises(BadRequestError): - await client.completions.create(**params) - else: - completion = await client.completions.create(**params) - if prompt_logprobs is not None: - assert completion.choices[0].prompt_logprobs is not None - assert len(completion.choices[0].prompt_logprobs) > 0 - - assert completion.choices[1].prompt_logprobs is not None - assert len(completion.choices[1].prompt_logprobs) > 0 - - else: - assert completion.choices[0].prompt_logprobs is None - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_streaming(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is an LLM?" 
- - single_completion = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - ) - single_output = single_completion.choices[0].text - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True) - chunks: list[str] = [] - finish_reason_count = 0 - async for chunk in stream: - chunks.append(chunk.choices[0].text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - # finish reason should only return in last block - assert finish_reason_count == 1 - assert chunk.choices[0].finish_reason == "length" - assert chunk.choices[0].text - assert "".join(chunks) == single_output - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): - """Streaming for parallel sampling. - The tokens from multiple samples, are flattened into a single stream, - with an index to indicate which sample the token belongs to. - """ - - prompt = "What is an LLM?" - n = 3 - max_tokens = 5 - - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=max_tokens, - n=n, - stream=True) - chunks: list[list[str]] = [[] for i in range(n)] - finish_reason_count = 0 - async for chunk in stream: - index = chunk.choices[0].index - text = chunk.choices[0].text - chunks[index].append(text) - if chunk.choices[0].finish_reason is not None: - finish_reason_count += 1 - assert finish_reason_count == n - for chunk in chunks: - assert len(chunk) == max_tokens - print("".join(chunk)) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_completion_stream_options(client: openai.AsyncOpenAI, - model_name: str): - prompt = "What is the capital of France?" 
- - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - False, - }) - - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": False, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": False, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is None - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": False} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - False, - }) - async for chunk in stream: - if chunk.choices[0].finish_reason is None: - assert chunk.usage is None - else: - assert chunk.usage is None - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=True, stream_options= - # {"include_usage": True, "continuous_usage_stats": True} - stream = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - stream_options={ - "include_usage": True, - "continuous_usage_stats": - True, - }) - async for chunk in stream: - assert chunk.usage is not None - assert chunk.usage.prompt_tokens > 0 - assert chunk.usage.completion_tokens > 0 - assert 
chunk.usage.total_tokens == (chunk.usage.prompt_tokens + - chunk.usage.completion_tokens) - if chunk.choices[0].finish_reason is not None: - final_chunk = await stream.__anext__() - assert final_chunk.usage is not None - assert final_chunk.usage.prompt_tokens > 0 - assert final_chunk.usage.completion_tokens > 0 - assert final_chunk.usage.total_tokens == ( - final_chunk.usage.prompt_tokens + - final_chunk.usage.completion_tokens) - assert final_chunk.choices == [] - - # Test stream=False, stream_options= - # {"include_usage": None} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": None}) - - # Test stream=False, stream_options= - # {"include_usage": True} - with pytest.raises(BadRequestError): - await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"include_usage": True}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": None} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": None}) - - # Test stream=False, stream_options= - # {"continuous_usage_stats": True} - with pytest.raises(BadRequestError): - await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=False, - stream_options={"continuous_usage_stats": True}) - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): - # test both text and token IDs - for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - 
temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=prompts, - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but - # not necessary for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=prompts, - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] - - -@pytest.mark.asyncio -async def test_logits_bias(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 5 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - token_id = 1000 - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token_id): 100}, - seed=42, - ) - assert len(completion.choices[0].text) >= 5 - response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), - add_special_tokens=False)["input_ids"] - assert all([ - response == expected - for response, expected in zip(response_tokens, expected_tokens) - ]) - - # Test ban - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - ) - 
response_tokens = tokenizer(completion.choices[0].text, - add_special_tokens=False)["input_ids"] - first_response = completion.choices[0].text - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - logit_bias={str(token): -100 - for token in response_tokens}, - ) - assert first_response != completion.choices[0].text - - -@pytest.mark.asyncio -async def test_allowed_token_ids(client: openai.AsyncOpenAI): - prompt = "Hello, my name is" - max_tokens = 1 - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - - # Test exclusive selection - allowed_ids = [21555, 21557, 21558] - completion = await client.completions.create( - model=MODEL_NAME, - prompt=prompt, - max_tokens=max_tokens, - temperature=0.0, - seed=42, - extra_body=dict(allowed_token_ids=allowed_ids), - logprobs=1, - ) - response_tokens = completion.choices[0].logprobs.tokens - assert len(response_tokens) == 1 - assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids - - -@pytest.mark.asyncio -async def test_structured_outputs_json_completion( - client: openai.AsyncOpenAI, - sample_json_schema, - is_v1_server: bool, -): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example JSON for an employee profile " - f"that fits this schema: {sample_json_schema}", - n=3, - temperature=1.0, - max_tokens=500, - extra_body=dict(structured_outputs=dict(json=sample_json_schema))) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - output_json = json.loads(completion.choices[i].text) - jsonschema.validate(instance=output_json, schema=sample_json_schema) - - -@pytest.mark.asyncio -async def test_structured_outputs_regex_completion( - client: openai.AsyncOpenAI, - sample_regex, - is_v1_server: bool, -): - if not is_v1_server: - pytest.skip("structured outputs 
is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=f"Give an example IPv4 address with this regex: {sample_regex}", - n=3, - temperature=1.0, - max_tokens=20, - extra_body=dict(structured_outputs=dict(regex=sample_regex))) - - assert completion.id is not None - assert len(completion.choices) == 3 - for i in range(3): - assert re.fullmatch(sample_regex, - completion.choices[i].text) is not None - - -@pytest.mark.asyncio -async def test_structured_outputs_choice_completion( - client: openai.AsyncOpenAI, - sample_structured_outputs_choices, - is_v1_server: bool, -): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt="The best language for type-safe systems programming is ", - n=2, - temperature=1.0, - max_tokens=10, - extra_body=dict(structured_outputs=dict( - choice=sample_structured_outputs_choices))) - - assert completion.id is not None - assert len(completion.choices) == 2 - for i in range(2): - assert completion.choices[i].text in sample_structured_outputs_choices - - -@pytest.mark.asyncio -async def test_structured_outputs_grammar(client: openai.AsyncOpenAI, - sample_sql_statements, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("grammar is only supported in v1 engine") - - completion = await client.completions.create( - model=MODEL_NAME, - prompt=("Generate a sql state that select col_1 from " - "table_1 where it is equals to 1"), - temperature=1.0, - max_tokens=500, - extra_body=dict( - structured_outputs=dict(grammar=sample_sql_statements), )) - - content = completion.choices[0].text - - # use Lark to parse the output, and make sure it's a valid parse tree - from lark import Lark - parser = Lark(sample_sql_statements) - parser.parse(content) - - # remove spaces for comparison b/c we removed them in the grammar - ground_truth = "SELECT col_1 from table_1 where col_1 = 
1".replace(" ", "") - - assert content.strip() == ground_truth - - -@pytest.mark.asyncio -@pytest.mark.parametrize( - # first test base model, then test loras - "model_name", - [MODEL_NAME, "zephyr-lora"], -) -@pytest.mark.parametrize("logprobs_arg", [1, 0]) -async def test_echo_logprob_completion(client: openai.AsyncOpenAI, - model_name: str, logprobs_arg: int): - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) - # test using text and token IDs - for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): - completion = await client.completions.create(model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - echo=True, - logprobs=logprobs_arg) - - prompt_text = tokenizer.decode(prompt) if isinstance(prompt, - list) else prompt - assert re.search(r"^" + prompt_text, completion.choices[0].text) - logprobs = completion.choices[0].logprobs - assert logprobs is not None - assert len(logprobs.text_offset) > 5 - assert (len(logprobs.token_logprobs) > 5 - and logprobs.token_logprobs[0] is None) - assert (len(logprobs.top_logprobs) > 5 - and logprobs.top_logprobs[0] is None) - for top_logprobs in logprobs.top_logprobs[1:]: - assert max(logprobs_arg, - 1) <= len(top_logprobs) <= logprobs_arg + 1 - assert len(logprobs.tokens) > 5 - - -@pytest.mark.asyncio -async def test_structured_outputs_type_error(client: openai.AsyncOpenAI, - sample_json_schema, sample_regex, - is_v1_server: bool): - if not is_v1_server: - pytest.skip("structured outputs is only supported in v1 engine") - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example JSON that fits this schema: 42", - extra_body=dict(structured_outputs=dict(json=42))) - - with pytest.raises(openai.BadRequestError): - _ = await client.completions.create( - model=MODEL_NAME, - prompt="Give an example string that fits this regex", - extra_body=dict(structured_outputs=dict( - regex=sample_regex, - json=sample_json_schema, - ))) - - 
-@pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name,stream,echo", - [ - (MODEL_NAME, False, False), - (MODEL_NAME, False, True), - (MODEL_NAME, True, False), - (MODEL_NAME, True, True) # should not raise BadRequestError error - ], -) -async def test_echo_stream_completion(client: openai.AsyncOpenAI, - model_name: str, stream: bool, - echo: bool): - saying: str = "Hello, my name is" - result = await client.completions.create(model=model_name, - prompt=saying, - max_tokens=10, - temperature=0.0, - echo=echo, - stream=stream) - - stop_reason = "length" - - if not stream: - completion = result - assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - - choice = completion.choices[0] - assert len(choice.text) >= 5 - assert choice.finish_reason == stop_reason - - if echo: - assert choice.text is not None and saying in choice.text - else: - assert choice.text is not None and saying not in choice.text - - else: - chunks: list[str] = [] - final_finish_reason = None - async for chunk in result: - if chunk.choices and chunk.choices[0].text: - chunks.append(chunk.choices[0].text) - if chunk.choices and chunk.choices[0].finish_reason: - final_finish_reason = chunk.choices[0].finish_reason - - assert final_finish_reason == stop_reason - content = "".join(chunks) - if echo: - assert content is not None and saying in content - else: - assert content is not None and saying not in content - - -@pytest.mark.asyncio -async def test_invocations(server: RemoteOpenAIServer, - client: openai.AsyncOpenAI): - request_args = { - "model": MODEL_NAME, - "prompt": "Hello, my name is", - "max_tokens": 5, - "temperature": 0.0, - "logprobs": None, - } - - completion = await client.completions.create(**request_args) - - invocation_response = requests.post(server.url_for("invocations"), - json=request_args) - invocation_response.raise_for_status() - - completion_output = completion.model_dump() - invocation_output = 
invocation_response.json() - - assert completion_output.keys() == invocation_output.keys() - assert completion_output["choices"] == invocation_output["choices"] diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 7b58f851a4d21..3d56291bc793c 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,6 +14,9 @@ from transformers import AutoConfig from ...utils import RemoteOpenAIServer +pytest.skip("Skipping prompt_embeds test until V1 supports it.", + allow_module_level=True) + # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 10c0cb5f4d151..6f2addd3649da 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -53,12 +53,13 @@ def monkeypatch_module(): mpatch.undo() -@pytest.fixture(scope="module", params=[False, True]) +@pytest.fixture(scope="module", params=[True]) def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files): use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') + assert use_v1 + monkeypatch_module.setenv('VLLM_USE_V1', '1') # Define the json format LoRA module configurations lora_module_1 = { diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 0c9e0f3a51429..8917aa5a5efb9 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" PREV_MINOR_VERSION = version._prev_minor_version() -@pytest.fixture(scope="module", params=[True, False]) +@pytest.fixture(scope="module", params=[True]) def use_v1(request): # Module-scoped variant of 
run_with_both_engines # diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 5f43fdc9588f3..ef9d5234f2317 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -10,8 +10,30 @@ import pytest from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import default_server_args # noqa: F401 -from .test_completion import MODEL_NAME + +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def default_server_args(zephyr_lora_files): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--max-num-seqs", + "128", + "--enforce-eager", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora={zephyr_lora_files}", + "--max-lora-rank", + "64", + "--max-cpu-loras", + "2", + ] @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py index 840e0dac81c97..b469fc76fc7a2 100644 --- a/tests/entrypoints/openai/test_skip_tokenizer.py +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" DTYPE = "float16" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module") def server(): args = [ diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 794c1f68f1471..28c24f62895ab 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -7,7 +7,6 @@ import pytest import vllm.envs as envs from vllm import LLM from vllm.engine.arg_utils import AsyncEngineArgs -from 
vllm.engine.async_llm_engine import AsyncLLMEngine MODEL = "meta-llama/Llama-3.2-1B-Instruct" @@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch): _ = AsyncEngineArgs(model=MODEL).create_engine_config() assert envs.VLLM_USE_V1 m.delenv("VLLM_USE_V1") - - -def test_reject_using_constructor_directly(monkeypatch): - with monkeypatch.context() as m: - if os.getenv("VLLM_USE_V1", None): - m.delenv("VLLM_USE_V1") - - # Sets VLLM_USE_V1=1. - vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config() - - # This uses the V0 constructor directly. - with pytest.raises(ValueError): - AsyncLLMEngine(vllm_config, - AsyncLLMEngine._get_executor_cls(vllm_config), - log_stats=True) - - m.delenv("VLLM_USE_V1") diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6793041abc502..ede027759a8b2 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,1032 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import asyncio -import time -import weakref -from functools import partial -from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, - Mapping, Optional, Set, Tuple, Type, Union) -from weakref import ReferenceType +from vllm.v1.engine.async_llm import AsyncLLM -import vllm.envs as envs -from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) -from vllm.core.scheduler import SchedulerOutputs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.metrics_types import StatLoggerBase -from vllm.engine.protocol import EngineClient -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from 
vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, deprecate_kwargs, weak_bind - -logger = init_logger(__name__) -ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S - - -class AsyncEngineDeadError(RuntimeError): - pass - - -def _log_task_completion(task: asyncio.Task, - error_callback: Callable[[Exception], None]) -> None: - """This function is only intended for the `engine.run_engine_loop()` task. - - In particular, that task runs a `while True` loop that can only exit if - there is an exception. - """ - - exception = None - try: - return_value = task.result() - raise AssertionError( - f"The engine background task should never finish without an " - f"exception. {return_value}") - except asyncio.exceptions.CancelledError: - # We assume that if the task is cancelled, we are gracefully shutting - # down. This should only happen on program exit. - logger.info("Engine is gracefully shutting down.") - except Exception as e: - exception = e - logger.error("Engine background task failed", exc_info=e) - error_callback(exception) - raise AsyncEngineDeadError( - "Task finished unexpectedly. This should never happen! " - "Please open an issue on GitHub. 
See stack trace above for the " - "actual cause.") from e - - -STOP_ITERATION = Exception() # Sentinel - - -class AsyncStream: - """A stream of RequestOutputs for a request that can be iterated over - asynchronously via an async generator.""" - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait( - exception if self._is_raisable(exception) else STOP_ITERATION) - - @property - def finished(self) -> bool: - return self._finished - - async def generator(self) -> AsyncGenerator[RequestOutput, None]: - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - if result == STOP_ITERATION: - return - raise result - yield result - except GeneratorExit: - self._cancel(self.request_id) - raise asyncio.CancelledError from None - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) - - -class RequestTracker: - """Synchronous abstraction for tracking requests.""" - - def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} - self._aborted_requests: asyncio.Queue[str] = asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, - dict]] = asyncio.Queue() - self.new_requests_event = asyncio.Event() - - def __contains__(self, item): - return item in self._request_streams - - def __len__(self) -> int: - return len(self._request_streams) - - def propagate_exception(self, - exc: Exception, - request_id: Optional[str] = None) -> None: - """Propagate an 
exception to request streams - (all if request_id is None).""" - if request_id is not None: - self.abort_request(request_id, exception=exc) - else: - # NB: tuple() used here because self.abort_request pops the stream - # out of self._request_streams, so we can't iterate on it directly - for rid in tuple(self._request_streams.keys()): - self.abort_request(rid, exception=exc) - - def process_request_output(self, - request_output: RequestOutput, - *, - verbose: bool = False) -> None: - """Process a request output from the engine.""" - request_id = request_output.request_id - finished = request_output.finished - - if finished: - stream = self._request_streams.pop(request_id, None) - else: - stream = self._request_streams.get(request_id) - # Guard against a KeyError which can occur if the request was aborted - # while the output was generated - if stream is not None: - stream.put(request_output) - if finished: - stream.finish() - - if verbose and finished: - logger.info("Finished request %s.", request_id) - - def process_exception(self, - request_id: str, - exception: BaseException, - *, - verbose: bool = False) -> None: - """Propagate an exception from the engine.""" - if verbose: - logger.info("Finished request %s.", request_id) - self.abort_request(request_id, exception=exception) - - def add_request(self, - request_id: str, - *, - verbose: bool = False, - **engine_add_request_kwargs) -> AsyncStream: - """Add a request to be sent to the engine on the next background - loop iteration.""" - if request_id in self._request_streams: - raise KeyError(f"Request {request_id} already exists.") - - abort_request = partial(self.abort_request, verbose=verbose) - stream = AsyncStream(request_id, abort_request) - self._new_requests.put_nowait((stream, { - "request_id": request_id, - **engine_add_request_kwargs - })) - - self.new_requests_event.set() - - if verbose: - logger.info("Added request %s.", request_id) - - return stream - - def abort_request(self, - request_id: str, - *, 
- exception: Optional[Union[BaseException, - Type[BaseException]]] = None, - verbose: bool = False) -> None: - """Abort a request during next background loop iteration.""" - if verbose: - logger.info("Aborted request %s.", request_id) - - self._aborted_requests.put_nowait(request_id) - - stream = self._request_streams.pop(request_id, None) - if stream is not None: - stream.finish(exception=exception) - - def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: - """Get the new requests and finished requests to be - sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() - - while not self._aborted_requests.empty(): - request_id = self._aborted_requests.get_nowait() - finished_requests.add(request_id) - - while not self._new_requests.empty(): - stream, new_request = self._new_requests.get_nowait() - request_id = stream.request_id - if request_id in finished_requests: - # The request has already been aborted. - stream.finish(asyncio.CancelledError) - finished_requests.discard(request_id) - else: - self._request_streams[request_id] = stream - new_requests.append(new_request) - - return new_requests, finished_requests - - async def wait_for_new_requests(self): - if not self.has_new_requests(): - await self.new_requests_event.wait() - self.new_requests_event.clear() - - def has_new_requests(self): - return not self._new_requests.empty() - - -class _AsyncLLMEngine(LLMEngine): - """Extension of LLMEngine to add async methods.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - async def step_async(self, virtual_engine: int) -> List[RequestOutput]: - """Performs one decoding iteration and returns newly generated results. - The workers are ran asynchronously if possible. - - This function performs one decoding iteration of the engine. It first - schedules the sequences to be executed in the next iteration and the - token blocks to be swapped in/out/copy. 
Then, it executes the model - and updates the scheduler with the model outputs. Finally, it decodes - the sequences and returns the newly generated results. - """ - # these are cached outputs from previous iterations. None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - if not self._has_remaining_steps(seq_group_metadata_list): - - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - if not scheduler_outputs.is_empty(): - # this will cause mamba_cache/minimax_cache failed - # to release finished_requests_ids of the last steps - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. 
- last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - virtual_engine=virtual_engine, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - # Execute the model. - outputs = await self.model_executor.execute_model_async( - execute_model_req) - - else: - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - outputs = [] - - if not self._has_remaining_steps(seq_group_metadata_list): - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. - is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len( - outputs - ) == 1, "Async postprocessor expects only a single output set" - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. 
- self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - return ctx.request_outputs - - async def stop_remote_worker_execution_loop_async(self) -> None: - """Stop the remote worker execution loop.""" - await self.model_executor.stop_remote_worker_execution_loop_async() - - async def get_tokenizer_async(self) -> AnyTokenizer: - return self.get_tokenizer() - - async def add_request_async( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> None: - """ - Async version of - [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]. - """ - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - if arrival_time is None: - arrival_time = time.time() - - if data_parallel_rank is not None: - raise ValueError("Targeting data_parallel_rank only supported " - "in v1 client.") - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - # We use the -2 dimension (instead of 0) in case a batched input - # of batch size 1 is passed in. 
- prompt["prompt_token_ids"] = [0 - ] * prompt["prompt_embeds"].shape[-2] - - processed_inputs = await self.input_preprocessor.preprocess_async( - prompt, - tokenization_kwargs=tokenization_kwargs, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - async def check_health_async(self) -> None: - self.model_executor.check_health() - - async def collective_rpc_async(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - raise NotImplementedError - - -class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. - - This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to - make it asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked - by the generate method when there are requests in the waiting queue. The - generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] - to the caller. - - Args: - log_requests: Whether to log the requests. - start_engine_loop: If True, the background task to run the engine - will be automatically started in the generate call. - *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. - """ - - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine - - def __init__(self, - *args: Any, - log_requests: bool = True, - start_engine_loop: bool = True, - **kwargs: Any) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) 
or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.log_requests = log_requests - self.engine = self._engine_class(*args, **kwargs) - - # This ensures quick processing of request outputs - # so the append to asyncio queues is not delayed, - # especially for multi-step. - self.use_process_request_outputs_callback = ( - self.engine.model_config.use_async_output_proc) - - if self.use_process_request_outputs_callback: - self.engine.process_request_outputs_callback = \ - weak_bind(self.process_request_outputs) - - self.background_loop: Optional[asyncio.Future] = None - # We need to keep a reference to unshielded - # task as well to prevent it from being garbage - # collected - self._background_loop_unshielded: Optional[asyncio.Task] = None - self.start_engine_loop = start_engine_loop - self._errored_with: Optional[BaseException] = None - - # Lazy initialized fields - self._request_tracker: RequestTracker - - def __del__(self): - if rt := getattr(self, "request_tracker", None): - # Wake up engine loop so that it will exit cleanly - rt.new_requests_event.set() - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - return LLMEngine._get_executor_cls(engine_config) - - @classmethod - @deprecate_kwargs( - "disable_log_requests", - additional_message=("This argument will have no effect. 
" - "Use `enable_log_requests` instead."), - ) - def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - enable_log_requests: bool = False, - disable_log_stats: bool = False, - disable_log_requests: bool = True, # Deprecated, will be removed - ) -> "AsyncLLMEngine": - """Create an AsyncLLMEngine from the EngineArgs.""" - - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - start_engine_loop=start_engine_loop, - log_requests=enable_log_requests, - log_stats=not disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - - async_engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine - async_engine_cls = V1AsyncLLMEngine - - return async_engine_cls.from_vllm_config( - vllm_config=vllm_config, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - enable_log_requests=engine_args.enable_log_requests, - ) - - @property - def is_running(self) -> bool: - return (self.background_loop is not None - and self._background_loop_unshielded is not None - and not self._background_loop_unshielded.done()) - - @property - def is_stopped(self) -> bool: - return self.errored or (self.background_loop is not None and - self._background_loop_unshielded is not None - and self._background_loop_unshielded.done()) - - @property - def 
errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - def set_errored(self, exc: Exception) -> None: - self._errored_with = exc - - def _error_callback(self, exc: Exception) -> None: - self.set_errored(exc) - self._request_tracker.propagate_exception(exc) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.engine.input_preprocessor - - async def get_tokenizer(self) -> AnyTokenizer: - return self.engine.get_tokenizer() - - def start_background_loop(self) -> None: - """Start the background loop.""" - if self.errored: - raise AsyncEngineDeadError( - "Background loop has errored already.") from self._errored_with - if self.is_running: - raise RuntimeError("Background loop is already running.") - # Initialize the RequestTracker here so it uses the right event loop. - self._request_tracker = RequestTracker() - - self._background_loop_unshielded = asyncio.get_event_loop( - ).create_task(self.run_engine_loop(weakref.ref(self))) - self._background_loop_unshielded.add_done_callback( - partial(_log_task_completion, error_callback=self._error_callback)) - self.background_loop = asyncio.shield(self._background_loop_unshielded) - - def shutdown_background_loop(self) -> None: - """ - Shut down the background loop. - - This method needs to be called during cleanup to remove - references to `self` and properly GC the resources held - by the async LLM engine (e.g., the executors as well as - their resources). 
- """ - if self._background_loop_unshielded is not None: - self._background_loop_unshielded.cancel() - self._background_loop_unshielded = None - self.background_loop = None - - async def engine_step(self, virtual_engine: int) -> bool: - """Kick the engine to process the waiting requests. - - Returns True if there are in-progress requests.""" - - new_requests, aborted_requests = ( - self._request_tracker.get_new_and_aborted_requests()) - - for new_request in new_requests: - # Add the request into the vLLM engine's waiting queue. - try: - await self.engine.add_request_async(**new_request) - except ValueError as e: - # TODO: use a vLLM specific error for failed validation - self._request_tracker.process_exception( - new_request["request_id"], - e, - verbose=self.log_requests, - ) - - if aborted_requests: - await self._engine_abort(aborted_requests) - - request_outputs = await self.engine.step_async(virtual_engine) - - # Put the outputs into the corresponding streams. - # If used as a callback, then already invoked inside - # LLMEngine's _process_model_outputs - if not self.use_process_request_outputs_callback: - all_finished = self.process_request_outputs(request_outputs) - else: - # For callback case, we only need to detect when all - # requests are finished - all_finished = all(request_output.finished - for request_output in request_outputs) - - return not all_finished - - def process_request_outputs(self, request_outputs) -> bool: - # Put the outputs into the corresponding streams. 
- all_finished = True - for request_output in request_outputs: - self._request_tracker.process_request_output( - request_output, verbose=self.log_requests) - all_finished = all_finished and request_output.finished - - return all_finished - - async def _engine_abort(self, request_ids: Iterable[str]): - self.engine.abort_request(request_ids) - - @staticmethod - async def run_engine_loop(engine_ref: ReferenceType): - """We use a weakref to the engine so that the running loop - doesn't prevent the engine being garbage collected.""" - engine: Optional[AsyncLLMEngine] = engine_ref() - if not engine: - return - - pipeline_parallel_size = \ - engine.engine.parallel_config.pipeline_parallel_size - has_requests_in_progress = [False] * pipeline_parallel_size - while True: - if not any(has_requests_in_progress): - logger.debug("Waiting for new requests...") - # Stop the execute model loop in parallel workers until there - # are more requests to process. This avoids waiting - # indefinitely in torch.distributed ops which may otherwise - # time out, and unblocks the RPC thread in the workers so that - # they can process any other queued control plane messages, - # such as add/remove lora adapters. - await engine.engine.stop_remote_worker_execution_loop_async() - request_tracker = engine._request_tracker - # Allow engine to be garbage collected while - # waiting for new requests - del engine - await asyncio.sleep(0) - if engine_ref() is None: - return - await request_tracker.wait_for_new_requests() - engine = engine_ref() - if not engine: - return - logger.debug("Got new requests!") - requests_in_progress = [ - asyncio.create_task(engine.engine_step(ve)) - for ve in range(pipeline_parallel_size) - ] - has_requests_in_progress = [True] * pipeline_parallel_size - - # Abort if iteration takes too long due to unrecoverable errors - # (eg. NCCL timeouts). 
- try: - async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): - done, _ = await asyncio.wait( - requests_in_progress, - return_when=asyncio.FIRST_COMPLETED) - for _ in range(pipeline_parallel_size): - await asyncio.sleep(0) - for task in done: - result = task.result() - virtual_engine = requests_in_progress.index(task) - has_unfinished_requests = ( - engine.engine. - has_unfinished_requests_for_virtual_engine( - virtual_engine)) - if result or has_unfinished_requests: - requests_in_progress[virtual_engine] = ( - asyncio.create_task( - engine.engine_step(virtual_engine))) - has_requests_in_progress[virtual_engine] = True - else: - has_requests_in_progress[virtual_engine] = False - except asyncio.TimeoutError as exc: - logger.error( - "Engine iteration timed out. This should never happen!") - engine.set_errored(exc) - raise - await asyncio.sleep(0) - - async def add_request( - self, - request_id: str, - prompt: PromptType, - params: SamplingParams, - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[RequestOutput, None]: - if not self.is_running: - if self.start_engine_loop: - self.start_background_loop() - else: - raise AsyncEngineDeadError( - "Background loop is not running. 
If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - if (priority != 0 - and not self.engine.scheduler_config.policy == "priority"): - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - stream = self._request_tracker.add_request( - request_id, - verbose=self.log_requests, - prompt=prompt, - params=params, - arrival_time=arrival_time or time.time(), - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - tokenization_kwargs=tokenization_kwargs, - ) - - return stream.generator() - - async def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - data_parallel_rank: The (global) data parallel rank that must - handle this request. Only applicable if DP is enabled. - Yields: - The output `RequestOutput` objects from the LLMEngine - for the request. 
- - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> # note that engine_args here is AsyncEngineArgs instance - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "prompt": "What is LLM?", - >>> "stream": False, # assume the non-streaming case - >>> "temperature": 0.0, - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.generate( - >>> example_input["prompt"], - >>> SamplingParams(temperature=example_input["temperature"]), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... 
- """ - try: - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - ): - yield LLMEngine.validate_output(output, RequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise - - def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - raise NotImplementedError( - "Pooling models are not supported in vLLM V0") - - async def abort(self, request_id: Union[str, Iterable[str]]) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - if not isinstance(request_id, str): - raise RuntimeError("Only single-request abort supported in" - " deprecated V0") - if not self.is_running: - raise AsyncEngineDeadError( - "Background loop is not running. If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - return self._abort(request_id) - - def _abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. 
- """ - self._request_tracker.abort_request(request_id, - exception=asyncio.CancelledError, - verbose=self.log_requests) - - async def get_vllm_config(self) -> VllmConfig: - """Get the vllm configuration of the vLLM engine.""" - return self.engine.get_vllm_config() - - async def get_model_config(self) -> ModelConfig: - """Get the model configuration of the vLLM engine.""" - return self.engine.get_model_config() - - async def get_parallel_config(self) -> ParallelConfig: - """Get the parallel configuration of the vLLM engine.""" - return self.engine.get_parallel_config() - - async def get_scheduler_config(self) -> SchedulerConfig: - """Get the scheduling configuration of the vLLM engine.""" - return self.engine.get_scheduler_config() - - async def get_lora_config(self) -> LoRAConfig: - """Get the lora configuration of the vLLM engine.""" - return self.engine.get_lora_config() - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None) -> None: - self.engine.do_log_stats() - - async def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - t = time.perf_counter() - logger.debug("Starting health check...") - if self.is_stopped: - raise AsyncEngineDeadError("Background loop is stopped.") - - await self.engine.check_health_async() - logger.debug("Health check took %fs", time.perf_counter() - t) - - async def is_tracing_enabled(self) -> bool: - return self.engine.is_tracing_enabled() - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - self.engine.add_logger(logger_name=logger_name, logger=logger) - - def remove_logger(self, logger_name: str) -> None: - self.engine.remove_logger(logger_name=logger_name) - - async def start_profile(self) -> None: - self.engine.start_profile() - - async def stop_profile(self) -> None: - self.engine.stop_profile() - - async def reset_mm_cache(self) -> None: - self.engine.reset_mm_cache() - - async def 
reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - self.engine.reset_prefix_cache(device) - - async def sleep(self, level: int = 1) -> None: - await self.reset_prefix_cache() - self.engine.sleep(level) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - async def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - async def add_lora(self, lora_request: LoRARequest) -> bool: - return self.engine.add_lora(lora_request) - - async def collective_rpc(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - """ - Perform a collective RPC call to the given path. - """ - return await self.engine.collective_rpc_async(method, timeout, args, - kwargs) - - -# TODO(v1): Remove this class proxy when V1 goes default. -if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM - - AsyncLLMEngine = AsyncLLM # type: ignore +AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index c3195dbc4697f..8b2acedf805c1 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -11,7 +11,6 @@ import uvicorn from fastapi import FastAPI, Request, Response from vllm import envs -from vllm.engine.async_llm_engine import AsyncEngineDeadError from vllm.engine.protocol import EngineClient from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT, H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT) @@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None: """ @app.exception_handler(RuntimeError) - @app.exception_handler(AsyncEngineDeadError) @app.exception_handler(EngineDeadError) @app.exception_handler(EngineGenerateError) async def runtime_exception_handler(request: Request, __): diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 912e664120929..11031cd616d20 
100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -38,7 +38,6 @@ from typing_extensions import assert_never import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import (load_chat_template, resolve_hf_chat_template, @@ -201,50 +200,34 @@ async def build_async_engine_client_from_engine_args( vllm_config = engine_args.create_engine_config(usage_context=usage_context) # V1 AsyncLLM. - if envs.VLLM_USE_V1: - if disable_frontend_multiprocessing: - logger.warning( - "V1 is enabled, but got --disable-frontend-multiprocessing. " - "To disable frontend multiprocessing, set VLLM_USE_V1=0.") + assert envs.VLLM_USE_V1 - from vllm.v1.engine.async_llm import AsyncLLM - async_llm: Optional[AsyncLLM] = None - client_count = client_config.pop( - "client_count") if client_config else 1 - client_index = client_config.pop( - "client_index") if client_config else 0 - try: - async_llm = AsyncLLM.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats, - client_addresses=client_config, - client_count=client_count, - client_index=client_index) + if disable_frontend_multiprocessing: + logger.warning( + "V1 is enabled, but got --disable-frontend-multiprocessing. 
" + "To disable frontend multiprocessing, set VLLM_USE_V1=0.") - # Don't keep the dummy data in memory - await async_llm.reset_mm_cache() + from vllm.v1.engine.async_llm import AsyncLLM + async_llm: Optional[AsyncLLM] = None + client_count = client_config.pop("client_count") if client_config else 1 + client_index = client_config.pop("client_index") if client_config else 0 + try: + async_llm = AsyncLLM.from_vllm_config( + vllm_config=vllm_config, + usage_context=usage_context, + enable_log_requests=engine_args.enable_log_requests, + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_count=client_count, + client_index=client_index) - yield async_llm - finally: - if async_llm: - async_llm.shutdown() + # Don't keep the dummy data in memory + await async_llm.reset_mm_cache() - # V0 AsyncLLM. - else: - - engine_client: Optional[EngineClient] = None - try: - engine_client = AsyncLLMEngine.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - enable_log_requests=engine_args.enable_log_requests, - disable_log_stats=engine_args.disable_log_stats) - yield engine_client - finally: - if engine_client and hasattr(engine_client, "shutdown"): - engine_client.shutdown() + yield async_llm + finally: + if async_llm: + async_llm.shutdown() async def validate_json_request(raw_request: Request): From 064cac7bb7251862a841d8057d83581350edf837 Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Thu, 18 Sep 2025 19:15:23 +0100 Subject: [PATCH 48/58] [fix]: remove data type hardcoding from gptoss model implementation (#23807) Signed-off-by: Nikhil Gupta --- vllm/model_executor/models/gpt_oss.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 990a1d6d883a1..b49fd0d8f88af 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -76,7 +76,6 @@ class OAIAttention(nn.Module): self.sinks = 
torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, - dtype=torch.bfloat16, requires_grad=False)) self.q_size = self.num_attention_heads * self.head_dim // tp_size @@ -145,8 +144,7 @@ class MLPBlock(torch.nn.Module): self.experts_per_token = config.num_experts_per_tok self.world_size = dist.get_world_size() if dist.is_initialized() else 1 self.router = torch.nn.Linear(config.hidden_size, - config.num_local_experts, - dtype=torch.bfloat16) + config.num_local_experts) assert config.intermediate_size % self.world_size == 0 self.experts = FusedMoE(num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, From 38db529f66712502a3cf93488229fc9fd2dc76fc Mon Sep 17 00:00:00 2001 From: Aziz Date: Thu, 18 Sep 2025 21:18:56 +0200 Subject: [PATCH 49/58] [feat]: Create interface for model-specific M-RoPE (#24194) Signed-off-by: AzizCode92 Signed-off-by: Aziz Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/models/__init__.py | 11 ++- vllm/model_executor/models/interfaces.py | 68 +++++++++++++ vllm/model_executor/models/qwen2_vl.py | 118 ++++++++++++++++++++++- vllm/v1/worker/gpu_model_runner.py | 33 +++++-- vllm/worker/model_runner.py | 42 +++++--- 5 files changed, 242 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index d3ee6872dd8bf..4ccba64f2c110 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, - SupportsPP, SupportsTranscription, SupportsV0Only, - has_inner_state, supports_lora, supports_multimodal, - supports_pp, supports_transcription, supports_v0_only) +from .interfaces import (HasInnerState, SupportsLoRA, 
SupportsMRoPE, + SupportsMultiModal, SupportsPP, SupportsTranscription, + SupportsV0Only, has_inner_state, supports_lora, + supports_mrope, supports_multimodal, supports_pp, + supports_transcription, supports_v0_only) from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration, is_pooling_model, is_text_generation_model) from .registry import ModelRegistry @@ -21,6 +22,8 @@ __all__ = [ "supports_lora", "SupportsMultiModal", "supports_multimodal", + "SupportsMRoPE", + "supports_mrope", "SupportsPP", "supports_pp", "SupportsTranscription", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8f8e300c84d71..e9c600e36cfa7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, import numpy as np import torch from torch import Tensor +from transformers import PretrainedConfig from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs @@ -852,3 +853,70 @@ def supports_eagle3( model: Union[type[object], object], ) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]: return isinstance(model, SupportsEagle3) + + +@runtime_checkable +class SupportsMRoPE(Protocol): + """The interface required for all models that support M-RoPE.""" + + supports_mrope: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports M-RoPE. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. 
+ """ + + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """ + Get M-RoPE input positions and delta value for this specific model. + + This method should be implemented by each model that supports M-RoPE + to provide model-specific logic for computing input positions. + + Args: + input_tokens: List of input token IDs + hf_config: HuggingFace model configuration + image_grid_thw: Image grid dimensions (t, h, w) + video_grid_thw: Video grid dimensions (t, h, w) + second_per_grid_ts: Seconds per grid timestep for videos + context_len: Context length + seq_len: Sequence length + audio_feature_lengths: Audio feature lengths for multimodal models + use_audio_in_video: Whether to use audio in video for interleaving + + Returns: + Tuple of (llm_positions, mrope_position_delta) + - llm_positions: Tensor of shape [3, num_tokens] + with T/H/W positions + - mrope_position_delta: Delta for position calculations + """ + ... + + +@overload +def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]: + ... + + +@overload +def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]: + ... 
+ + +def supports_mrope( + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]: + return isinstance(model, SupportsMRoPE) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b6576b783b64a..7f361678ba72e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -32,7 +32,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoConfig, BatchFeature +from transformers import AutoConfig, BatchFeature, PretrainedConfig from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) from transformers.models.qwen2_vl.configuration_qwen2_vl import ( @@ -73,7 +73,7 @@ from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import (MultiModalEmbeddings, SupportsLoRA, +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, @@ -1096,7 +1096,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] info=Qwen2VLProcessingInfo, dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsLoRA, SupportsPP): + SupportsLoRA, SupportsPP, SupportsMRoPE): # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper( @@ -1109,6 +1109,118 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get M-RoPE input positions for Qwen2-VL model.""" + if image_grid_thw is None: + image_grid_thw = [] + if video_grid_thw is None: + video_grid_thw = [] + if second_per_grid_ts is None: + second_per_grid_ts = [] + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_videos > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 
+ else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts: + video_second_per_grid_t = second_per_grid_ts[video_index] + video_index += 1 + remain_videos -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/v1/worker/gpu_model_runner.py 
b/vllm/v1/worker/gpu_model_runner.py index 4873b586724ec..053e8f0537ed9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import (is_mixture_of_experts, supports_eagle3, + supports_mrope, supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) @@ -730,16 +731,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if mm_input.get("use_audio_in_video") is True: use_audio_in_video = True - req_state.mrope_positions, req_state.mrope_position_delta = \ - MRotaryEmbedding.get_input_positions_tensor( - req_state.prompt_token_ids, - hf_config=self.model_config.hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + if supports_mrope(self.model): + req_state.mrope_positions, req_state.mrope_position_delta = \ + self.model.get_mrope_input_positions( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + else: + req_state.mrope_positions, req_state.mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) def _extract_mm_kwargs( self, diff --git 
a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 88f83c9dd7e6c..594382650e3c1 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -41,7 +41,8 @@ from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, get_sampler) from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.model_executor.models import (supports_lora, supports_mrope, + supports_multimodal) from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, @@ -670,18 +671,33 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): inter_data.seq_ids[seq_idx]] token_ids = seq_data.get_token_ids() - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + if supports_mrope(self.runner.model): + mrope_input_positions, mrope_position_delta = \ + self.runner.model.get_mrope_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + mrope_input_positions = mrope_input_positions.tolist() + else: + mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + hf_config=hf_config, + 
image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) seq_data.mrope_position_delta = mrope_position_delta inter_data.mrope_input_positions[ From 75fb112d80f680624dc99a00e02be6a45661f948 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:32:24 -0400 Subject: [PATCH 50/58] [Bug] Fix `returned_lse` not Defined issue (#25106) Signed-off-by: yewentao256 Co-authored-by: Tyler Michael Smith --- vllm/v1/attention/backends/mla/cutlass_mla.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 21be17a750df4..ae534f3207b51 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -206,12 +206,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): ) if H < MAX_HEADS: - # Extract the subsets of the outputs - returned_lse = lse[:, :H].contiguous( - ) if self.need_to_return_lse_for_decode else lse out = out[:, :H] + if self.need_to_return_lse_for_decode: + lse = lse[:, :H].contiguous() - return out, returned_lse + return out, lse def _forward_decode( self, From d2a30a2d933226d3951ad98cb5de0c74e2e64826 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:38:37 -0400 Subject: [PATCH 51/58] [Bug] Fix torch Compilation Cache Hit Error (#25093) Signed-off-by: yewentao256 --- vllm/config/compilation.py | 12 ------------ vllm/platforms/cuda.py | 17 ++++++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index f8ccc20222615..3618f472e742d 100644 --- 
a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -563,18 +563,6 @@ class CompilationConfig: self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] - if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": - # exclude MoE dispatch/combine from capture by ensuring - # piecewise splitting includes them, so communication remains - # outside CUDA graphs while compute can still be graphed. - moe_ops = [ - "vllm.moe_forward", - "vllm.moe_forward_shared", - ] - for op in moe_ops: - if op not in self.splitting_ops: - self.splitting_ops.append(op) - def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8e3436a9e73c5..87d8f2b7481bb 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -191,14 +191,17 @@ class CudaPlatformBase(Platform): compilation_config = vllm_config.compilation_config if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 - and compilation_config.cudagraph_mode - not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]): + and compilation_config.cudagraph_mode != CUDAGraphMode.NONE): + # TODO: Piecewise Cuda graph might be enabled + # if torch compile cache key issue fixed + # See https://github.com/vllm-project/vllm/pull/25093 logger.info( - "Data Parallel with DeepEP high-throughput: using PIECEWISE " - "CUDA graphs and excluding MoE ops from capture. Set " - "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE " - "graphs captured as well.") - compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + "Data Parallel: disabling cudagraphs since DP " + "with DeepEP high-throughput kernels are not CUDA Graph " + "compatible. The DeepEP low-latency kernels are CUDA Graph " + "compatible. 
Set the all_to_all backend to deepep_low_latency " + "to use those kernels instead.") + compilation_config.cudagraph_mode = CUDAGraphMode.NONE @classmethod def get_current_memory_usage(cls, From 1c3dad22ff92cbf84e0fa8ad1643c560a07944ea Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 13:35:21 -0700 Subject: [PATCH 52/58] [V0 Deprecation] Remove unused async_timeout.py (#25190) Signed-off-by: Woosuk Kwon --- vllm/engine/async_timeout.py | 173 ----------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 vllm/engine/async_timeout.py diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py deleted file mode 100644 index 3b9c055160c1b..0000000000000 --- a/vllm/engine/async_timeout.py +++ /dev/null @@ -1,173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Workaround for https://github.com/python/cpython/issues/86296 -# -# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py -# Licensed under the Apache License (Apache-2.0) - -import asyncio -import enum -import sys -from types import TracebackType -from typing import Any, Optional, Type - -if sys.version_info[:2] >= (3, 11): - from asyncio import timeout as asyncio_timeout -else: - - class _State(enum.Enum): - INIT = "INIT" - ENTER = "ENTER" - TIMEOUT = "TIMEOUT" - EXIT = "EXIT" - - class Timeout: - # Internal class, please don't instantiate it directly - # Use timeout() and timeout_at() public factories instead. - # - # Implementation note: `async with timeout()` is preferred - # over `with timeout()`. - # While technically the Timeout class implementation - # doesn't need to be async at all, - # the `async with` statement explicitly points that - # the context manager should be used from async function context. - # - # This design allows to avoid many silly misusages. - # - # TimeoutError is raised immediately when scheduled - # if the deadline is passed. 
- # The purpose is to time out as soon as possible - # without waiting for the next await expression. - - __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") - - def __init__(self, deadline: Optional[float], - loop: asyncio.AbstractEventLoop) -> None: - self._loop = loop - self._state = _State.INIT - - self._timeout_handler = None # type: Optional[asyncio.Handle] - if deadline is None: - self._deadline = None # type: Optional[float] - else: - self.update(deadline) - - async def __aenter__(self) -> "Timeout": - self._do_enter() - return self - - async def __aexit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> Optional[bool]: - self._do_exit(exc_type) - return None - - @property - def expired(self) -> bool: - """Is timeout expired during execution?""" - return self._state == _State.TIMEOUT - - @property - def deadline(self) -> Optional[float]: - return self._deadline - - def reject(self) -> None: - """Reject scheduled timeout if any.""" - # cancel is maybe better name but - # task.cancel() raises CancelledError in asyncio world. - if self._state not in (_State.INIT, _State.ENTER): - raise RuntimeError(f"invalid state {self._state.value}") - self._reject() - - def _reject(self) -> None: - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._timeout_handler = None - - def shift(self, delay: float) -> None: - """Advance timeout on delay seconds. - The delay can be negative. - Raise RuntimeError if shift is called when deadline is not scheduled - """ - deadline = self._deadline - if deadline is None: - raise RuntimeError( - "cannot shift timeout if deadline is not scheduled") - self.update(deadline + delay) - - def update(self, deadline: float) -> None: - """Set deadline to absolute value. - deadline argument points on the time in the same clock system - as loop.time(). - If new deadline is in the past the timeout is raised immediately. 
- Please note: it is not POSIX time but a time with - undefined starting base, e.g. the time of the system power on. - """ - if self._state == _State.EXIT: - raise RuntimeError( - "cannot reschedule after exit from context manager") - if self._state == _State.TIMEOUT: - raise RuntimeError("cannot reschedule expired timeout") - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._deadline = deadline - if self._state != _State.INIT: - self._reschedule() - - def _reschedule(self) -> None: - assert self._state == _State.ENTER - deadline = self._deadline - if deadline is None: - return - - now = self._loop.time() - if self._timeout_handler is not None: - self._timeout_handler.cancel() - - task = asyncio.current_task() - if deadline <= now: - self._timeout_handler = self._loop.call_soon( - self._on_timeout, task) - else: - self._timeout_handler = self._loop.call_at( - deadline, self._on_timeout, task) - - def _do_enter(self) -> None: - if self._state != _State.INIT: - raise RuntimeError(f"invalid state {self._state.value}") - self._state = _State.ENTER - self._reschedule() - - def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: - if exc_type is asyncio.CancelledError and \ - self._state == _State.TIMEOUT: - self._timeout_handler = None - raise asyncio.TimeoutError - # timeout has not expired - self._state = _State.EXIT - self._reject() - return None - - def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: - if task: - task.cancel() - self._state = _State.TIMEOUT - # drop the reference early - self._timeout_handler = None - - def asyncio_timeout(delay: Optional[float]) -> Timeout: - """timeout context manager. - Useful in cases when you want to apply timeout logic around block - of code or in cases when asyncio.wait_for is not suitable. For example: - >>> async with timeout(0.001): - ... async with aiohttp.get('https://github.com') as r: - ... 
await r.text() - delay - value in seconds or None to disable timeout logic - """ - loop = asyncio.get_running_loop() - deadline = loop.time() + delay if delay is not None else None - return Timeout(deadline, loop) From a53ad626d629e79264f0a6ab6820a4b547f3b1c4 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 18 Sep 2025 23:53:52 +0300 Subject: [PATCH 53/58] [KV offload][1b/N] rename offloading to kv_offload (#25191) Signed-off-by: Or Ozeri --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/{offloading => kv_offload}/test_worker.py | 4 ++-- vllm/v1/{offloading => kv_offload}/abstract.py | 0 vllm/v1/{offloading => kv_offload}/mediums.py | 2 +- vllm/v1/{offloading => kv_offload}/worker/worker.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename tests/v1/{offloading => kv_offload}/test_worker.py (97%) rename vllm/v1/{offloading => kv_offload}/abstract.py (100%) rename vllm/v1/{offloading => kv_offload}/mediums.py (93%) rename vllm/v1/{offloading => kv_offload}/worker/worker.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5fd08296625ad..c42ec4f2503d0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -280,7 +280,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/executor - - pytest -v -s v1/offloading + - pytest -v -s v1/kv_offload - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/kv_offload/test_worker.py similarity index 97% rename from tests/v1/offloading/test_worker.py rename to tests/v1/kv_offload/test_worker.py index 2391b565773aa..6cf8aa0875d62 100644 --- a/tests/v1/offloading/test_worker.py +++ b/tests/v1/kv_offload/test_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.v1.offloading.abstract import LoadStoreSpec -from vllm.v1.offloading.worker.worker 
import (OffloadingHandler, +from vllm.v1.kv_offload.abstract import LoadStoreSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, OffloadingWorker, TransferResult, TransferSpec) diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/kv_offload/abstract.py similarity index 100% rename from vllm/v1/offloading/abstract.py rename to vllm/v1/kv_offload/abstract.py diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/kv_offload/mediums.py similarity index 93% rename from vllm/v1/offloading/mediums.py rename to vllm/v1/kv_offload/mediums.py index 5a1887848c9fc..8962819178459 100644 --- a/vllm/v1/offloading/mediums.py +++ b/vllm/v1/kv_offload/mediums.py @@ -4,7 +4,7 @@ from abc import ABC import numpy as np -from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.kv_offload.abstract import LoadStoreSpec class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC): diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py similarity index 98% rename from vllm/v1/offloading/worker/worker.py rename to vllm/v1/kv_offload/worker/worker.py index d2c2045d1f1f6..b7a52a088fb90 100644 --- a/vllm/v1/offloading/worker/worker.py +++ b/vllm/v1/kv_offload/worker/worker.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from vllm.logger import init_logger -from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.kv_offload.abstract import LoadStoreSpec # a single transfer spec (src_blocks_spec, dst_blocks_spec) TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec] From 9fac6aa30b669de75d8718164cd99676d3530e7d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 18 Sep 2025 17:26:28 -0400 Subject: [PATCH 54/58] [BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206) Signed-off-by: Lucas Wilkinson --- vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 
a636a714145cf..4d1829cd228cd 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -36,7 +36,7 @@ def _extract_data_from_linear_base_module( assert m.quant_method.quant_config is not None w = m.weight - ws = m.weight_scale_inv + ws = m.weight_scale quant_block_size = m.quant_method.quant_config.weight_block_size assert isinstance(w, torch.Tensor) From 9a4600e4dcbbd13988c31d5198d3ab8b4172ecca Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Thu, 18 Sep 2025 19:03:09 -0500 Subject: [PATCH 55/58] [CORE] Prompt Embeddings Support for v1 Engine (#24278) Signed-off-by: Andrew Sansom Signed-off-by: Andrew Sansom Co-authored-by: Cyrus Leung --- .../test_basic_correctness.py | 10 -- .../test_completion_with_prompt_embeds.py | 1 - .../models/language/generation/test_common.py | 6 -- vllm/engine/arg_utils.py | 24 +++-- vllm/entrypoints/openai/protocol.py | 2 +- vllm/utils/__init__.py | 27 +++++ vllm/v1/core/sched/output.py | 24 +++-- vllm/v1/engine/__init__.py | 3 +- vllm/v1/engine/detokenizer.py | 33 ++++--- vllm/v1/engine/output_processor.py | 25 ++++- vllm/v1/engine/processor.py | 38 +++++-- vllm/v1/request.py | 15 ++- vllm/v1/sample/logits_processor/__init__.py | 2 +- vllm/v1/sample/logits_processor/builtin.py | 6 +- vllm/v1/sample/logits_processor/interface.py | 2 +- vllm/v1/serial_utils.py | 2 +- vllm/v1/worker/gpu_input_batch.py | 55 +++++++++-- vllm/v1/worker/gpu_model_runner.py | 99 ++++++++++++++++++- vllm/v1/worker/tpu_input_batch.py | 6 +- vllm/v1/worker/tpu_model_runner.py | 1 + 20 files changed, 305 insertions(+), 76 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index fba18f197074b..24b1c9a93126c 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -76,11 +76,6 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - - if 
enable_prompt_embeds and envs.is_set( - "VLLM_USE_V1") and envs.VLLM_USE_V1: - pytest.skip("enable_prompt_embeds is not supported in v1.") - if not envs.VLLM_USE_V1: if async_scheduling: pytest.skip("async_scheduling only supported in v1.") @@ -164,11 +159,6 @@ def test_models_distributed( extra_env: dict[str, str], enable_prompt_embeds: bool, ) -> None: - - if enable_prompt_embeds and envs.is_set( - "VLLM_USE_V1") and envs.VLLM_USE_V1: - pytest.skip("enable_prompt_embeds is not supported in v1.") - if test_suite != TARGET_TEST_SUITE: pytest.skip(f"Skip test for {test_suite}") diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 3d56291bc793c..0e3fc82f0c033 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -36,7 +36,6 @@ def default_server_args() -> list[str]: "--enforce-eager", # Prompt Embeds server args "--enable-prompt-embeds", - "--no-enable-chunked-prefill", ] diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index a5aa1e3f49743..c14e71cbdb96d 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -125,12 +125,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") - # Note: can be removed when - # https://github.com/vllm-project/vllm/pull/24278 finished - if current_platform.is_cpu() and use_prompt_embeds: - pytest.skip("Skipping use_prompt_embeds=True with " - "V1-only CPU backend.") - with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fb5beab77b270..63282c4253509 100644 --- 
a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1513,12 +1513,6 @@ class EngineArgs: recommend_to_remove=False) return False - # No text embedding inputs so far. - if self.enable_prompt_embeds: - _raise_or_fallback(feature_name="--enable-prompt-embeds", - recommend_to_remove=False) - return False - # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, @@ -1651,6 +1645,13 @@ class EngineArgs: "models in V0 and has been disabled.") self.enable_prefix_caching = False + if self.enable_prompt_embeds: + logger.warning( + "--enable-prompt-embeds and --enable-prefix-caching " + "are not supported together in V0. Prefix caching has " + "been disabled.") + self.enable_prefix_caching = False + # Set max_num_seqs to 256 for VLLM_V0. if self.max_num_seqs is None: self.max_num_seqs = 256 @@ -1664,6 +1665,17 @@ class EngineArgs: # For pooling tasks the default is False if model_config.runner_type != "pooling": self.enable_chunked_prefill = True + + # TODO: When prefix caching supports prompt embeds inputs, this + # check can be removed. + if (self.enable_prompt_embeds + and self.enable_prefix_caching is not False): + logger.warning( + "--enable-prompt-embeds and --enable-prefix-caching " + "are not supported together in V1. 
Prefix caching has " + "been disabled.") + self.enable_prefix_caching = False + if self.enable_prefix_caching is None: self.enable_prefix_caching = True else: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 7ad8e73d89d59..6b54511a66f33 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -973,7 +973,6 @@ class CompletionRequest(OpenAIBaseModel): # https://platform.openai.com/docs/api-reference/completions/create model: Optional[str] = None prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None - prompt_embeds: Optional[Union[bytes, list[bytes]]] = None best_of: Optional[int] = None echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 @@ -1009,6 +1008,7 @@ class CompletionRequest(OpenAIBaseModel): # --8<-- [end:completion-sampling-params] # --8<-- [start:completion-extra-params] + prompt_embeds: Optional[Union[bytes, list[bytes]]] = None add_special_tokens: bool = Field( default=True, description=( diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index f13381ecd9ff3..d4013a69e99fe 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -3443,3 +3443,30 @@ def decorate_logs(process_name: Optional[str] = None) -> None: pid = os.getpid() _add_prefix(sys.stdout, process_name, pid) _add_prefix(sys.stderr, process_name, pid) + + +def length_from_prompt_token_ids_or_embeds( + prompt_token_ids: Optional[list[int]], + prompt_embeds: Optional[torch.Tensor], +) -> int: + """Calculate the request length (in number of tokens) give either + prompt_token_ids or prompt_embeds. 
+ """ + prompt_token_len = None if prompt_token_ids is None else len( + prompt_token_ids) + prompt_embeds_len = \ + None if prompt_embeds is None else len(prompt_embeds) + + if prompt_token_len is None: + if prompt_embeds_len is None: + raise ValueError( + "Neither prompt_token_ids nor prompt_embeds were defined.") + return prompt_embeds_len + else: + if (prompt_embeds_len is not None + and prompt_embeds_len != prompt_token_len): + raise ValueError( + "Prompt token ids and prompt embeds had different lengths" + f" prompt_token_ids={prompt_token_len}" + f" prompt_embeds={prompt_embeds_len}") + return prompt_token_len diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 3ec5b91bf2860..209fc2a4404f3 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -11,6 +11,7 @@ from vllm._bc_linter import bc_linter_include if TYPE_CHECKING: import numpy as np import numpy.typing as npt + import torch from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata) @@ -26,13 +27,14 @@ if TYPE_CHECKING: class NewRequestData: req_id: str - prompt_token_ids: list[int] + prompt_token_ids: Optional[list[int]] mm_features: list[MultiModalFeatureSpec] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] block_ids: tuple[list[int], ...] 
num_computed_tokens: int lora_request: Optional[LoRARequest] + prompt_embeds: Optional[torch.Tensor] = None @classmethod def from_request( @@ -49,9 +51,12 @@ class NewRequestData: block_ids=block_ids, num_computed_tokens=request.num_computed_tokens, lora_request=request.lora_request, + prompt_embeds=request.prompt_embeds, ) - def __repr__(self): + def __repr__(self) -> str: + prompt_embeds_shape = (self.prompt_embeds.shape + if self.prompt_embeds is not None else None) return (f"NewRequestData(" f"req_id={self.req_id}," f"prompt_token_ids={self.prompt_token_ids}," @@ -59,19 +64,26 @@ class NewRequestData: f"sampling_params={self.sampling_params}," f"block_ids={self.block_ids}," f"num_computed_tokens={self.num_computed_tokens}," - f"lora_request={self.lora_request}" + f"lora_request={self.lora_request}," + f"prompt_embeds_shape={prompt_embeds_shape}" ")") # Version of __repr__ with the prompt data obfuscated - def anon_repr(self): + def anon_repr(self) -> str: + prompt_token_ids_len = len( + self.prompt_token_ids + ) if self.prompt_token_ids is not None else None + prompt_embeds_shape = (self.prompt_embeds.shape + if self.prompt_embeds is not None else None) return (f"NewRequestData(" f"req_id={self.req_id}," - f"prompt_token_ids_len={len(self.prompt_token_ids)}," + f"prompt_token_ids_len={prompt_token_ids_len}," f"mm_features={self.mm_features}," f"sampling_params={self.sampling_params}," f"block_ids={self.block_ids}," f"num_computed_tokens={self.num_computed_tokens}," - f"lora_request={self.lora_request}" + f"lora_request={self.lora_request}," + f"prompt_embeds_shape={prompt_embeds_shape}" ")") diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index dec4abec519bd..345f5a464c2cc 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -47,7 +47,7 @@ class EngineCoreRequest( gc=False): # type: ignore[call-arg] request_id: str - prompt_token_ids: list[int] + prompt_token_ids: Optional[list[int]] mm_features: Optional[list[MultiModalFeatureSpec]] 
sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] @@ -56,6 +56,7 @@ class EngineCoreRequest( lora_request: Optional[LoRARequest] cache_salt: Optional[str] data_parallel_rank: Optional[int] + prompt_embeds: Optional[torch.Tensor] = None # Index of the client, used to ensure outputs are sent back to the same # client for this request when scaling out the front-end. diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index cf4b06db843bd..8aa36d6a439c1 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -13,6 +13,7 @@ from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest logger = init_logger(__name__) @@ -179,11 +180,12 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): self.tokenizer: Tokenizer = tokenizer._tokenizer # Find a safe place to start. - prompt_suffix = request.prompt_token_ids + prompt_token_ids = request.prompt_token_ids or [] + prompt_suffix = prompt_token_ids prompt_len = len(prompt_suffix) if prompt_len > 4: for i in range(4, min(prompt_len + 1, 24)): - suffix = request.prompt_token_ids[-i:] + suffix = prompt_token_ids[-i:] if '�' not in self.tokenizer.decode(suffix): prompt_suffix = suffix break @@ -260,16 +262,25 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): params = request.sampling_params assert params is not None - # Metadata for incremental detokenization. 
- self.tokens, self.prefix_offset, self.read_offset = ( - convert_prompt_ids_to_tokens( - tokenizer=tokenizer, - prompt_ids=request.prompt_token_ids, - skip_special_tokens=params.skip_special_tokens, - )) + self.prompt_len = length_from_prompt_token_ids_or_embeds( + request.prompt_token_ids, request.prompt_embeds) - self.token_ids.extend(request.prompt_token_ids) - self.prompt_len = len(request.prompt_token_ids) + # Metadata for incremental detokenization. + if request.prompt_token_ids is not None: + self.tokens, self.prefix_offset, self.read_offset = ( + convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=params.skip_special_tokens, + )) + else: + # Prompt embedding requests cannot be detokenized, in general. + self.tokens = [""] * self.prompt_len + self.prefix_offset = 0 + self.read_offset = 0 + + self.token_ids.extend(request.prompt_token_ids + or [0] * self.prompt_len) self.skip_special_tokens = params.skip_special_tokens self.spaces_between_special_tokens = ( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 5dad63988daa4..c17dc3e204ecd 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -14,6 +14,7 @@ from vllm.sampling_params import RequestOutputKind from vllm.tracing import (SpanAttributes, SpanKind, Tracer, extract_trace_context) from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason from vllm.v1.engine.detokenizer import IncrementalDetokenizer from vllm.v1.engine.logprobs import LogprobsProcessor @@ -86,7 +87,8 @@ class RequestState: lora_name: Optional[str], output_kind: RequestOutputKind, prompt: Optional[str], - prompt_token_ids: list[int], + prompt_token_ids: Optional[list[int]], + prompt_embeds: Optional[torch.Tensor], logprobs_processor: Optional[LogprobsProcessor], 
detokenizer: Optional[IncrementalDetokenizer], max_tokens_param: Optional[int], @@ -104,7 +106,9 @@ class RequestState: self.output_kind = output_kind self.prompt = prompt self.prompt_token_ids = prompt_token_ids - self.prompt_len = len(prompt_token_ids) + self.prompt_embeds = prompt_embeds + self.prompt_len = length_from_prompt_token_ids_or_embeds( + self.prompt_token_ids, self.prompt_embeds) self.logprobs_processor = logprobs_processor self.detokenizer = detokenizer self.max_tokens_param = max_tokens_param @@ -165,6 +169,7 @@ class RequestState: output_kind=output_kind, prompt=prompt, prompt_token_ids=request.prompt_token_ids, + prompt_embeds=request.prompt_embeds, logprobs_processor=logprobs_processor, detokenizer=detokenizer, max_tokens_param=max_tokens_param, @@ -223,6 +228,8 @@ class RequestState: first_output = outputs[0] if isinstance(first_output, PoolingOutput): assert len(outputs) == 1 + # Prompt embeddings are currently not supported by pooling requests. + assert self.prompt_token_ids is not None return PoolingRequestOutput( request_id=request_id, outputs=first_output, @@ -236,10 +243,15 @@ class RequestState: else: prompt_logprobs = self.logprobs_processor.prompt_logprobs + # If prompt embeds were used, put placeholder prompt token ids + prompt_token_ids = self.prompt_token_ids + if prompt_token_ids is None and self.prompt_embeds is not None: + prompt_token_ids = [0] * len(self.prompt_embeds) + return RequestOutput( request_id=request_id, prompt=self.prompt, - prompt_token_ids=self.prompt_token_ids, + prompt_token_ids=prompt_token_ids, prompt_logprobs=prompt_logprobs, outputs=cast(list[CompletionOutput], outputs), finished=finished, @@ -469,6 +481,8 @@ class OutputProcessor: arrival_time_nano_seconds = int(req_state.stats.arrival_time * 1e9) trace_context = extract_trace_context(engine_core_output.trace_headers) + prompt_length = length_from_prompt_token_ids_or_embeds( + req_state.prompt_token_ids, req_state.prompt_embeds) with 
(self.tracer.start_as_current_span( "llm_request", kind=SpanKind.SERVER, @@ -488,7 +502,7 @@ class OutputProcessor: span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, queued_time) span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, - len(req_state.prompt_token_ids)) + prompt_length) span.set_attribute(SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, metrics.num_generation_tokens) span.set_attribute( @@ -544,7 +558,8 @@ class OutputProcessor: assert req_state.stats is not None iteration_stats.update_from_finished_request( finish_reason=finish_reason, - num_prompt_tokens=len(req_state.prompt_token_ids), + num_prompt_tokens=length_from_prompt_token_ids_or_embeds( + req_state.prompt_token_ids, req_state.prompt_embeds), max_tokens_param=req_state.max_tokens_param, req_stats=req_state.stats) self.lora_states.finish_request(req_state) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 71f539583a1be..507e2cd3223fd 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -19,6 +19,7 @@ from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) @@ -390,6 +391,16 @@ class Processor: self._validate_model_inputs(processed_inputs) encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) + # Mypy does not always properly infer the types of some elements of + # discriminated unions of TypedDicts, because of how it handles + # inheritance of TypedDict. If we explicitly extract the items we want + # we can avoid type errors from using `dict.get` later in the method. 
+ prompt_str: Optional[str] = None if decoder_inputs[ + "type"] == "embeds" else decoder_inputs.get("prompt") + prompt_token_ids = decoder_inputs[ + "prompt_token_ids"] if decoder_inputs["type"] != "embeds" else None + prompt_embeds = decoder_inputs["prompt_embeds"] if decoder_inputs[ + "type"] == "embeds" else None sampling_params = None pooling_params = None @@ -398,9 +409,10 @@ class Processor: sampling_params = params.clone() # If unset max tokens, then generate up to the max_model_len. if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) + seq_len = length_from_prompt_token_ids_or_embeds( + prompt_token_ids, prompt_embeds) + sampling_params.max_tokens = \ + self.model_config.max_model_len - seq_len sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) if self.tokenizer is not None: @@ -430,9 +442,10 @@ class Processor: identifier=decoder_mm_hashes[modality][idx], mm_position=decoder_mm_positions[modality][idx])) - return decoder_inputs.get("prompt"), EngineCoreRequest( + return prompt_str, EngineCoreRequest( request_id=request_id, - prompt_token_ids=decoder_inputs["prompt_token_ids"], + prompt_token_ids=prompt_token_ids, + prompt_embeds=prompt_embeds, mm_features=mm_features, sampling_params=sampling_params, pooling_params=pooling_params, @@ -461,10 +474,17 @@ class Processor: ): model_config = self.model_config - prompt_ids = prompt_inputs["prompt_token_ids"] + prompt_ids = None if prompt_inputs[ + "type"] == "embeds" else prompt_inputs["prompt_token_ids"] + prompt_embeds = prompt_inputs["prompt_embeds"] if prompt_inputs[ + "type"] == "embeds" else None + prompt_len = length_from_prompt_token_ids_or_embeds( + prompt_ids, prompt_embeds) if not prompt_ids: if prompt_type == "encoder" and model_config.is_multimodal_model: pass # Mllama may have empty encoder inputs for text-only data + elif prompt_inputs["type"] == "embeds": + 
pass # Prompt embeds should not have prompt_ids. else: raise ValueError(f"The {prompt_type} prompt cannot be empty") @@ -472,7 +492,7 @@ class Processor: tokenizer = None else: tokenizer = self.tokenizer - max_input_id = max(prompt_ids, default=0) + max_input_id = max(prompt_ids or [], default=0) # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while # self.model_config.get_vocab_size() is the model’s vocab size. @@ -490,7 +510,7 @@ class Processor: f"Token id {max_input_id} is out of vocabulary") max_prompt_len = self.model_config.max_model_len - if len(prompt_ids) > max_prompt_len: + if prompt_len > max_prompt_len: if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( @@ -514,7 +534,7 @@ class Processor: "number of text tokens.") raise ValueError( - f"The {prompt_type} prompt (length {len(prompt_ids)}) is " + f"The {prompt_type} prompt (length {prompt_len}) is " f"longer than the maximum model length of {max_prompt_len}. 
" f"{suggestion}") diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 145af788d2372..ff10fa00c1cf6 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -7,9 +7,12 @@ from collections.abc import Mapping from functools import partial from typing import TYPE_CHECKING, Any, Callable, Optional, Union +import torch + from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams +from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, EngineCoreRequest, FinishReason) from vllm.v1.structured_output.request import StructuredOutputRequest @@ -25,12 +28,13 @@ class Request: def __init__( self, request_id: str, - prompt_token_ids: list[int], + prompt_token_ids: Optional[list[int]], sampling_params: Optional[SamplingParams], pooling_params: Optional[PoolingParams], eos_token_id: Optional[int], client_index: int = 0, arrival_time: Optional[float] = None, + prompt_embeds: Optional[torch.Tensor] = None, mm_features: Optional[list[MultiModalFeatureSpec]] = None, lora_request: Optional["LoRARequest"] = None, structured_output_request: Optional["StructuredOutputRequest"] = None, @@ -79,9 +83,13 @@ class Request: "sampling_params and pooling_params can't both be unset") self.prompt_token_ids = prompt_token_ids - self.num_prompt_tokens = len(self.prompt_token_ids) + self.prompt_embeds = prompt_embeds + self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + prompt_token_ids, prompt_embeds) self._output_token_ids: list[int] = [] - self._all_token_ids: list[int] = self.prompt_token_ids.copy() + self._all_token_ids: list[int] = self.prompt_token_ids.copy( + ) if self.prompt_token_ids is not None else [0 + ] * self.num_prompt_tokens self.num_output_placeholders = 0 # Used in async scheduling. 
self.spec_token_ids: list[int] = [] self.num_computed_tokens = 0 @@ -123,6 +131,7 @@ class Request: request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, + prompt_embeds=request.prompt_embeds, mm_features=request.mm_features, sampling_params=request.sampling_params, pooling_params=request.pooling_params, diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index df944873bcaf3..10cad5b530716 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -243,7 +243,7 @@ class AdapterLogitsProcessor(LogitsProcessor): def _new_state( self, params: SamplingParams, - prompt_ids: list[int], + prompt_ids: Optional[list[int]], output_ids: list[int], ) -> Optional[partial[torch.Tensor]]: """Return state representation for new request diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py index 60f9c0bdb6313..fc655d993cb4c 100644 --- a/vllm/v1/sample/logits_processor/builtin.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -187,7 +187,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): @staticmethod def add_request( - params: SamplingParams, _: list[int], output_tok_ids: list[int] + params: SamplingParams, _: Optional[list[int]], + output_tok_ids: list[int] ) -> Optional[tuple[int, Sequence[int], set[int]]]: min_tokens = params.min_tokens if not min_tokens or len(output_tok_ids) >= min_tokens: @@ -234,7 +235,8 @@ class MinTokensLogitsProcessor(LogitsProcessor): def process_dict_updates( req_entries: dict[int, T], batch_update: Optional[BatchUpdate], - new_state: Callable[[SamplingParams, list[int], list[int]], Optional[T]] + new_state: Callable[[SamplingParams, Optional[list[int]], list[int]], + Optional[T]] ) -> bool: """Utility function to update dict state for sparse LogitsProcessors.""" diff --git a/vllm/v1/sample/logits_processor/interface.py 
b/vllm/v1/sample/logits_processor/interface.py index 04027359909a6..a84afc2f347a0 100644 --- a/vllm/v1/sample/logits_processor/interface.py +++ b/vllm/v1/sample/logits_processor/interface.py @@ -26,7 +26,7 @@ RemovedRequest = int # (index, params, prompt_tok_ids, output_tok_ids) tuples for new # requests added to the batch. -AddedRequest = tuple[int, SamplingParams, list[int], list[int]] +AddedRequest = tuple[int, SamplingParams, Optional[list[int]], list[int]] # (index 1, index 2, directionality) tuples representing # one-way moves or two-way swaps of requests in batch diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index c8375d6f15517..50c1470c67edc 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -174,7 +174,7 @@ class MsgpackEncoder: ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]: assert self.aux_buffers is not None # view the tensor as a contiguous 1D array of bytes - arr = obj.flatten().contiguous().view(torch.uint8).numpy() + arr = obj.flatten().contiguous().cpu().view(torch.uint8).numpy() if obj.nbytes < self.size_threshold: # Smaller tensors are encoded inline, just like ndarrays. 
data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 6717622efb801..79a392337574f 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -13,7 +13,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType -from vllm.utils import swap_dict_values +from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, @@ -29,7 +29,7 @@ from vllm.v1.worker.block_table import MultiGroupBlockTable class CachedRequestState: req_id: str - prompt_token_ids: list[int] + prompt_token_ids: Optional[list[int]] mm_features: list[MultiModalFeatureSpec] sampling_params: Optional[SamplingParams] pooling_params: Optional[PoolingParams] @@ -43,9 +43,11 @@ class CachedRequestState: mrope_position_delta: Optional[int] = None lora_request: Optional[LoRARequest] = None + prompt_embeds: Optional[torch.Tensor] = None def __post_init__(self): - self.num_prompt_tokens = len(self.prompt_token_ids) + self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + self.prompt_token_ids, self.prompt_embeds) @property def num_tokens(self) -> int: @@ -63,6 +65,10 @@ class CachedRequestState: def get_token_id(self, idx: int) -> int: if idx < self.num_prompt_tokens: + if self.prompt_token_ids is None: + raise ValueError( + f"Tried to access token index {idx}, but that token was " + "provided via prompt_embeds, and its ID is unknown.") return self.prompt_token_ids[idx] elif idx - self.num_prompt_tokens < len(self.output_token_ids): return self.output_token_ids[idx - self.num_prompt_tokens] @@ -109,6 +115,14 @@ class InputBatch: 
pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() + self.is_token_ids = torch.zeros((max_num_reqs, max_model_len), + device="cpu", + dtype=bool, + pin_memory=False) + # Store prompt embeddings per request to avoid OOM from large upfront + # allocation if max_model_len is big. + # Maps req_index -> tensor of shape (num_prompt_tokens, hidden_size) + self.req_prompt_embeds: dict[int, torch.Tensor] = {} self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) @@ -310,15 +324,23 @@ class InputBatch: self.req_id_to_index[req_id] = req_index # Copy the prompt token ids and output token ids. - num_prompt_tokens = len(request.prompt_token_ids) + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + request.prompt_token_ids, request.prompt_embeds) self.num_prompt_tokens[req_index] = num_prompt_tokens - self.token_ids_cpu[ - req_index, :num_prompt_tokens] = request.prompt_token_ids start_idx = num_prompt_tokens end_idx = start_idx + len(request.output_token_ids) + if request.prompt_token_ids is not None: + self.token_ids_cpu[ + req_index, :num_prompt_tokens] = request.prompt_token_ids + self.is_token_ids[req_index, :num_prompt_tokens] = True + else: + self.is_token_ids[req_index, :num_prompt_tokens] = False + if request.prompt_embeds is not None: + self.req_prompt_embeds[req_index] = request.prompt_embeds self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids - # Number of token ids in token_ids_cpu. + self.is_token_ids[req_index, start_idx:end_idx] = True + # Number of token ids in prompt (token_ids_cpu or prompt_embeds). # NOTE(woosuk): This may include spec decode tokens. self.num_tokens[req_index] = request.num_tokens # Number of tokens without spec decode tokens. @@ -503,6 +525,20 @@ class InputBatch: self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] self.token_ids_cpu[i2, ...] 
= tmp + self.is_token_ids[[i1, i2], ...] = self.is_token_ids[[i2, i1], ...] + + # Swap prompt embeddings if they exist + embeds_i1 = self.req_prompt_embeds.get(i1) + embeds_i2 = self.req_prompt_embeds.get(i2) + if embeds_i1 is not None: + self.req_prompt_embeds[i2] = embeds_i1 + else: + self.req_prompt_embeds.pop(i2, None) + if embeds_i2 is not None: + self.req_prompt_embeds[i1] = embeds_i2 + else: + self.req_prompt_embeds.pop(i1, None) + self.block_table.swap_row(i1, i2) self.request_lora_mapping[i1], self.request_lora_mapping[i2] = \ @@ -592,6 +628,11 @@ class InputBatch: num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ last_req_index, :num_tokens] + self.is_token_ids[empty_index, :num_tokens] = self.is_token_ids[ + last_req_index, :num_tokens] + if last_req_index in self.req_prompt_embeds: + self.req_prompt_embeds[ + empty_index] = self.req_prompt_embeds.pop(last_req_index) self.num_tokens[empty_index] = num_tokens self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 053e8f0537ed9..3ee2160a42ffe 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -56,7 +56,9 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, check_use_alibi, get_dtype_size, - is_pin_memory_available, round_up, supports_dynamo) + is_pin_memory_available, + length_from_prompt_token_ids_or_embeds, round_up, + supports_dynamo) from vllm.v1.attention.backends.flash_attn import AttentionMetadata from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( @@ -197,6 +199,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): 
cache_config.cache_dtype] self.is_pooling_model = (model_config.runner_type == 'pooling') + self.enable_prompt_embeds = model_config.enable_prompt_embeds self.is_multimodal_raw_input_only_model = ( model_config.is_multimodal_raw_input_only_model) @@ -342,6 +345,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.hidden_size, dtype=self.dtype, numpy=False) + self.is_token_ids = self._make_buffer(self.max_num_tokens, + dtype=torch.bool) self.discard_request_indices = self._make_buffer(self.max_num_reqs, dtype=torch.int64) self.num_discarded_requests = 0 @@ -574,6 +579,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, + prompt_embeds=new_req_data.prompt_embeds, mm_features=new_req_data.mm_features, sampling_params=sampling_params, pooling_params=pooling_params, @@ -819,6 +825,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if self.input_batch.prev_sampled_token_ids is None: # Normal scheduling case self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) + self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) return # Async scheduling case, where some decode requests from the previous @@ -844,6 +852,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # If not all requests are decodes from the last iteration, # We need to copy the input_ids_cpu to the GPU first. self.input_ids.copy_to_gpu(total_num_scheduled_tokens) + self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) + self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) if num_commmon_tokens == 0: # No requests in common with the previous iteration # So input_ids_cpu will have all the input ids. 
@@ -857,6 +867,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0], non_blocking=True) + self.is_token_ids.gpu[:num_commmon_tokens] = True return # Upload the index tensors asynchronously # so the scatter can be non-blocking. @@ -947,14 +958,60 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # where M is the max_model_len. token_indices = (positions_np + req_indices * self.input_batch.token_ids_cpu.shape[1]) + token_indices_tensor = torch.from_numpy(token_indices) # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(), 0, - torch.from_numpy(token_indices), + token_indices_tensor, out=self.input_ids.cpu[:total_num_scheduled_tokens]) + is_token_ids = self.input_batch.is_token_ids.flatten() + torch.index_select( + is_token_ids, + 0, + token_indices_tensor, + out=self.is_token_ids.cpu[:total_num_scheduled_tokens]) + + # Because we did not pre-allocate a massive prompt_embeds CPU tensor on + # the InputBatch, we need to fill in the prompt embeds into the expected + # spots in the GpuModelRunner's pre-allocated prompt_embeds tensor. 
+ if self.input_batch.req_prompt_embeds: + output_idx = 0 + for req_idx in range(num_reqs): + num_sched = num_scheduled_tokens[req_idx] + + # Skip if this request doesn't have embeddings + if req_idx not in self.input_batch.req_prompt_embeds: + output_idx += num_sched + continue + + # Skip if no tokens scheduled + if num_sched <= 0: + output_idx += num_sched + continue + + req_embeds = self.input_batch.req_prompt_embeds[req_idx] + start_pos = self.input_batch.num_computed_tokens_cpu[req_idx] + + # Skip if trying to read beyond available embeddings + if start_pos >= req_embeds.shape[0]: + output_idx += num_sched + continue + + # Copy available embeddings + end_pos = start_pos + num_sched + actual_end = min(end_pos, req_embeds.shape[0]) + actual_num_sched = actual_end - start_pos + + if actual_num_sched > 0: + self.inputs_embeds.cpu[output_idx:output_idx + + actual_num_sched].copy_( + req_embeds[start_pos:actual_end] + ) + + output_idx += num_sched self.input_batch.block_table.compute_slot_mapping( req_indices, positions_np) @@ -1279,7 +1336,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.num_computed_tokens_cpu[index] num_scheduled_tokens = \ scheduler_output.num_scheduled_tokens[req_id] - num_prompt_tokens = len(req.prompt_token_ids) + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + req.prompt_token_ids, req.prompt_embeds) if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: prompt_part_len = max(0, @@ -1845,6 +1903,32 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } + elif (self.enable_prompt_embeds and get_pp_group().is_first_rank): + # Get the input embeddings for the tokens that are not input embeds, + # then put them into the appropriate positions. 
+ # TODO(qthequartermasterman): Since even when prompt embeds are + # enabled, (a) not all requests will use prompt embeds, and (b) + # after the initial prompt is processed, the rest of the generated + # tokens will be token ids, it is not desirable to have the + # embedding layer outside of the CUDA graph all the time. The v0 + # engine avoids this by "double compiling" the CUDA graph, once + # with input_ids and again with inputs_embeds, for all num_tokens. + # If a batch only has token ids, then including the embedding layer + # in the CUDA graph will be more performant (like in the else case + # below). + token_ids_idx = self.is_token_ids.gpu[:num_scheduled_tokens] \ + .nonzero(as_tuple=False) \ + .squeeze(1) + # Some tokens ids may need to become embeds + if token_ids_idx.numel() > 0: + token_ids = self.input_ids.gpu[token_ids_idx] + tokens_to_embeds = self.model.get_input_embeddings( + input_ids=token_ids) + self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds + + inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + model_kwargs = self._init_model_kwargs(num_input_tokens) + input_ids = None else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the @@ -2023,6 +2107,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids + self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True self.input_batch.num_tokens_no_spec[req_idx] = end_idx self.input_batch.num_tokens[req_idx] = end_idx @@ -2570,6 +2655,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Get metadata for this request. 
request = self.requests[req_id] + if request.prompt_token_ids is None: + # Prompt logprobs is incompatible with prompt embeddings + continue + num_prompt_tokens = len(request.prompt_token_ids) prompt_token_ids = torch.tensor(request.prompt_token_ids).to( self.device, non_blocking=True) @@ -2922,6 +3011,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **model_kwargs, **self._dummy_mm_kwargs(num_reqs), } + elif self.enable_prompt_embeds: + input_ids = None + inputs_embeds = self.inputs_embeds.gpu[:num_tokens] + model_kwargs = self._init_model_kwargs(num_tokens) else: input_ids = self.input_ids.gpu[:num_tokens] inputs_embeds = None diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py index dfa54d0ad83b6..4cd0ac352de0f 100644 --- a/vllm/v1/worker/tpu_input_batch.py +++ b/vllm/v1/worker/tpu_input_batch.py @@ -9,7 +9,7 @@ import torch from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingType -from vllm.utils import swap_dict_values +from vllm.utils import length_from_prompt_token_ids_or_embeds, swap_dict_values from vllm.v1.outputs import LogprobsTensors from vllm.v1.worker.block_table import MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState @@ -213,7 +213,9 @@ class InputBatch: self.req_id_to_index[req_id] = req_index # Copy the prompt token ids and output token ids. 
- num_prompt_tokens = len(request.prompt_token_ids) + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + request.prompt_token_ids, request.prompt_embeds) + # TODO: copy prompt_embeds self.num_prompt_tokens[req_index] = num_prompt_tokens self.token_ids_cpu[ req_index, :num_prompt_tokens] = request.prompt_token_ids diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 43f12912707f1..01a8e5c3f0dba 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -387,6 +387,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, + prompt_embeds=new_req_data.prompt_embeds, mm_features=new_req_data.mm_features, sampling_params=sampling_params, pooling_params=None, From 9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Fri, 19 Sep 2025 03:20:51 +0300 Subject: [PATCH 56/58] [KV offload][2/N] Introduce LRU-based CPU offloading management (#20075) Signed-off-by: Or Ozeri --- tests/v1/kv_offload/test_cpu.py | 175 +++++++++++++++++++++++++++++ vllm/v1/kv_offload/backend.py | 96 ++++++++++++++++ vllm/v1/kv_offload/backends/cpu.py | 61 ++++++++++ vllm/v1/kv_offload/lru_manager.py | 132 ++++++++++++++++++++++ 4 files changed, 464 insertions(+) create mode 100644 tests/v1/kv_offload/test_cpu.py create mode 100644 vllm/v1/kv_offload/backend.py create mode 100644 vllm/v1/kv_offload/backends/cpu.py create mode 100644 vllm/v1/kv_offload/lru_manager.py diff --git a/tests/v1/kv_offload/test_cpu.py b/tests/v1/kv_offload/test_cpu.py new file mode 100644 index 0000000000000..cdee7811d85b3 --- /dev/null +++ b/tests/v1/kv_offload/test_cpu.py @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable +from dataclasses import dataclass +from typing 
import Optional + +import numpy as np + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent, + PrepareStoreOutput) +from vllm.v1.kv_offload.backends.cpu import CPUBackend +from vllm.v1.kv_offload.lru_manager import LRUOffloadingManager +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec + + +@dataclass +class ExpectedPrepareStoreOutput: + block_hashes_to_store: list[int] + store_block_ids: list[int] + block_hashes_evicted: list[int] + + +def to_hashes(int_hashes: list[int]) -> list[BlockHash]: + return [BlockHash(str(i).encode()) for i in int_hashes] + + +def verify_store_output( + prepare_store_output: Optional[PrepareStoreOutput], + expected_prepare_store_output: ExpectedPrepareStoreOutput): + assert prepare_store_output is not None + assert (prepare_store_output.block_hashes_to_store == to_hashes( + expected_prepare_store_output.block_hashes_to_store)) + assert (prepare_store_output.block_hashes_evicted == to_hashes( + expected_prepare_store_output.block_hashes_evicted)) + store_spec = prepare_store_output.store_spec + assert isinstance(store_spec, CPULoadStoreSpec) + expected_array = np.array(expected_prepare_store_output.store_block_ids, + dtype=np.int64) + assert np.array_equal(expected_array, store_spec.block_ids) + + +def verify_load_output(prepare_load_output: LoadStoreSpec, + expected_prepare_load_output: list[int]): + assert isinstance(prepare_load_output, CPULoadStoreSpec) + expected_array = np.array(expected_prepare_load_output, dtype=np.int64) + assert np.array_equal(expected_array, prepare_load_output.block_ids) + + +def verify_events(events: Iterable[OffloadingEvent], + block_size: int, + expected_stores: tuple[set[int], ...] = (), + expected_evictions: tuple[set[int], ...] 
= ()): + stores: list[set[BlockHash]] = [] + evictions: list[set[BlockHash]] = [] + for event in events: + assert event.medium == CPULoadStoreSpec.medium() + assert event.block_size == block_size + if event.removed: + evictions.append(set(event.block_hashes)) + else: + stores.append(set(event.block_hashes)) + + def to_hash_sets( + int_sets: tuple[set[int], ...]) -> tuple[set[BlockHash], ...]: + return tuple([set(to_hashes(list(int_set))) for int_set in int_sets]) + + assert tuple(evictions) == to_hash_sets(expected_evictions) + assert tuple(stores) == to_hash_sets(expected_stores) + + +def test_cpu_manager(): + """ + Tests LRUOffloadingManager with a CPUBackend. + """ + # initialize a CPU backend with a capacity of 4 blocks + block_size = 256 + cpu_backend = CPUBackend(block_size=block_size, num_blocks=4) + cpu_manager = LRUOffloadingManager(cpu_backend, enable_events=True) + + # prepare store [1, 2] + prepare_store_output = cpu_manager.prepare_store(to_hashes([1, 2])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[1, 2], + store_block_ids=[0, 1], + block_hashes_evicted=[], + )) + + # lookup [1, 2] -> not ready + assert cpu_manager.lookup(to_hashes([1, 2])) == 0 + + # no events so far + assert list(cpu_manager.take_events()) == [] + + # complete store [1, 2] + cpu_manager.complete_store(to_hashes([1, 2])) + verify_events(cpu_manager.take_events(), + block_size=block_size, + expected_stores=({1, 2}, )) + + # lookup [1, 2] + assert cpu_manager.lookup(to_hashes([1])) == 1 + assert cpu_manager.lookup(to_hashes([1, 2])) == 2 + assert cpu_manager.lookup(to_hashes([1, 2, 3])) == 2 + + # prepare store [2, 3, 4, 5] -> evicts [1] + prepare_store_output = cpu_manager.prepare_store(to_hashes([2, 3, 4, 5])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[3, 4, 5], + store_block_ids=[2, 3, 0], + block_hashes_evicted=[1], + )) + + # verify eviction event + 
verify_events(cpu_manager.take_events(), + block_size=block_size, + expected_evictions=({1}, )) + + # prepare store with no space + assert cpu_manager.prepare_store(to_hashes([1, 6])) is None + + # complete store [2, 3, 4, 5] + cpu_manager.complete_store(to_hashes([2, 3, 4, 5])) + + # prepare load [2, 3] + prepare_load_output = cpu_manager.prepare_load(to_hashes([2, 3])) + verify_load_output(prepare_load_output, [1, 2]) + + # prepare store with no space ([2, 3] is being loaded) + assert cpu_manager.prepare_store(to_hashes([6, 7, 8])) is None + + # complete load [2, 3] + cpu_manager.complete_load(to_hashes([2, 3])) + + # prepare store [6, 7, 8] -> evicts [2, 3, 4] (oldest) + prepare_store_output = cpu_manager.prepare_store(to_hashes([6, 7, 8])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[6, 7, 8], + store_block_ids=[3, 2, 1], + block_hashes_evicted=[2, 3, 4], + )) + + # complete store [6, 7, 8] + cpu_manager.complete_store(to_hashes([6, 7, 8])) + + # touch [5, 6, 7] (move to end of LRU order) + cpu_manager.touch(to_hashes([5, 6, 7])) + + # prepare store [7, 9] -> evicts [8] (oldest following previous touch) + prepare_store_output = cpu_manager.prepare_store(to_hashes([9])) + verify_store_output( + prepare_store_output, + ExpectedPrepareStoreOutput( + block_hashes_to_store=[9], + store_block_ids=[1], + block_hashes_evicted=[8], + )) + + # complete store [7, 9] with failure + cpu_manager.complete_store(to_hashes([7, 9]), success=False) + + # assert [7] is still stored, but [9] is not + assert cpu_manager.lookup(to_hashes([7])) == 1 + assert cpu_manager.lookup(to_hashes([9])) == 0 + + verify_events(cpu_manager.take_events(), + block_size=block_size, + expected_stores=({3, 4, 5}, {6, 7, 8}), + expected_evictions=({2, 3, 4}, {8})) diff --git a/vllm/v1/kv_offload/backend.py b/vllm/v1/kv_offload/backend.py new file mode 100644 index 0000000000000..87a74200116bb --- /dev/null +++ b/vllm/v1/kv_offload/backend.py @@ 
-0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ctypes +from abc import ABC, abstractmethod +from collections.abc import Iterable + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import LoadStoreSpec + + +class BlockStatus(ctypes.Structure): + """ + Offloading status for a single block of KV data. + Holds the following information: + + ref_cnt - the current number of transfers using this block as a source. + A value of -1 indicates the block is not yet ready to be read. + load_store_spec - backend-specific information on how to actually + read/write the block. + """ + _fields_ = [("ref_cnt", ctypes.c_int32)] + + def __init__(self): + super().__init__() + # initialize block as "not ready" (ref_cnt = -1) + self.ref_cnt = -1 + + @property + def is_ready(self) -> bool: + """ + Returns whether the block is ready to be read. + """ + return self.ref_cnt >= 0 + + +class Backend(ABC): + """ + An abstract class for allocating and returning specs for writing + KV blocks to some backend. + """ + + def __init__(self, block_size: int, medium: str): + self.block_size = block_size + self.medium = medium + + @abstractmethod + def get_num_free_blocks(self): + """ + Returns the current number of blocks that can be allocated. + """ + pass + + @abstractmethod + def allocate_blocks(self, + block_hashes: list[BlockHash]) -> list[BlockStatus]: + """ + Allocate space for writing blocks. + This method assumes there is enough space for allocation. + It is unsafe to use without checking get_num_free_blocks beforehand. + + Args: + block_hashes: the hashes identifying the blocks to be written. + + Returns: + A list of BlockStatus for the allocated blocks. + The ref_cnt of each returned item will be -1, meaning the block + is not yet ready to be read. + """ + pass + + @abstractmethod + def free(self, block: BlockStatus): + """ + Free a previously allocated block.
+ You should only call this function with blocks returned by + allocate_blocks, and only once per each block. + + Args: + block: The block to be freed. + """ + pass + + def get_load_store_spec(self, block_hashes: Iterable[BlockHash], + blocks: Iterable[BlockStatus]) -> LoadStoreSpec: + """ + Get backend-specific information on how to read/write blocks. + + Args: + block_hashes: the list of block hashes identifying the blocks. + blocks: the list of blocks. + + Returns: + A LoadStoreSpec that can be used by a worker + to read/write the blocks. + """ + raise NotImplementedError diff --git a/vllm/v1/kv_offload/backends/cpu.py b/vllm/v1/kv_offload/backends/cpu.py new file mode 100644 index 0000000000000..eb1123d1d83ac --- /dev/null +++ b/vllm/v1/kv_offload/backends/cpu.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ctypes +from collections.abc import Iterable + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import LoadStoreSpec +from vllm.v1.kv_offload.backend import Backend, BlockStatus +from vllm.v1.kv_offload.mediums import CPULoadStoreSpec + + +class CPUBlockStatus(BlockStatus): + _fields_ = BlockStatus._fields_ + [("block_id", ctypes.c_int64) + ] # type: ignore + + def __init__(self, block_id: int): + super().__init__() + self.block_id = block_id + + +class CPUBackend(Backend): + + def __init__(self, block_size: int, num_blocks: int): + super().__init__(block_size=block_size, + medium=CPULoadStoreSpec.medium()) + + self.num_blocks: int = num_blocks + self.num_allocated_blocks: int = 0 + self.allocated_blocks_free_list: list[int] = [] + + def get_num_free_blocks(self): + return (len(self.allocated_blocks_free_list) + self.num_blocks - + self.num_allocated_blocks) + + def allocate_blocks(self, + block_hashes: list[BlockHash]) -> list[BlockStatus]: + num_fresh_blocks = min(len(block_hashes), + self.num_blocks - self.num_allocated_blocks) + 
num_reused_blocks = len(block_hashes) - num_fresh_blocks + assert len(self.allocated_blocks_free_list) >= num_reused_blocks + + # allocate fresh blocks + blocks: list[BlockStatus] = [] + for _ in range(num_fresh_blocks): + blocks.append(CPUBlockStatus(self.num_allocated_blocks)) + self.num_allocated_blocks += 1 + + # allocate reused blocks + for _ in range(num_reused_blocks): + block_id = self.allocated_blocks_free_list.pop() + blocks.append(CPUBlockStatus(block_id)) + + return blocks + + def free(self, block: BlockStatus): + assert isinstance(block, CPUBlockStatus) + self.allocated_blocks_free_list.append(block.block_id) + + def get_load_store_spec(self, block_hashes: Iterable[BlockHash], + blocks: Iterable[BlockStatus]) -> LoadStoreSpec: + return CPULoadStoreSpec([block.block_id for block in blocks]) diff --git a/vllm/v1/kv_offload/lru_manager.py b/vllm/v1/kv_offload/lru_manager.py new file mode 100644 index 0000000000000..18d3b1d637b32 --- /dev/null +++ b/vllm/v1/kv_offload/lru_manager.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections import OrderedDict +from collections.abc import Iterable +from typing import Optional + +from vllm.v1.core.kv_cache_utils import BlockHash +from vllm.v1.kv_offload.abstract import (LoadStoreSpec, OffloadingEvent, + OffloadingManager, PrepareStoreOutput) +from vllm.v1.kv_offload.backend import Backend, BlockStatus + + +class LRUOffloadingManager(OffloadingManager): + """ + An OffloadingManager with a pluggable backend, which evicts blocks by LRU. 
+ """ + + def __init__(self, backend: Backend, enable_events: bool = False): + self.backend: Backend = backend + # block_hash -> BlockStatus + self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict() + self.events: Optional[list[OffloadingEvent]] = \ + [] if enable_events else None + + def lookup(self, block_hashes: Iterable[BlockHash]) -> int: + hit_count = 0 + for block_hash in block_hashes: + block = self.blocks.get(block_hash) + if block is None or not block.is_ready: + break + hit_count += 1 + return hit_count + + def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec: + blocks = [] + for block_hash in block_hashes: + block = self.blocks[block_hash] + assert block.is_ready + block.ref_cnt += 1 + blocks.append(block) + + return self.backend.get_load_store_spec(block_hashes, blocks) + + def touch(self, block_hashes: Iterable[BlockHash]): + for block_hash in reversed(list(block_hashes)): + if self.blocks.get(block_hash): + self.blocks.move_to_end(block_hash) + + def complete_load(self, block_hashes: Iterable[BlockHash]): + for block_hash in block_hashes: + block = self.blocks[block_hash] + assert block.ref_cnt > 0 + block.ref_cnt -= 1 + + def prepare_store( + self, + block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]: + # filter out blocks that are already stored + block_hashes_to_store = [ + block_hash for block_hash in block_hashes + if block_hash not in self.blocks + ] + + num_blocks_to_evict = (len(block_hashes_to_store) - + self.backend.get_num_free_blocks()) + + # build list of blocks to evict + to_evict = [] + if num_blocks_to_evict > 0: + for block_hash, block in self.blocks.items(): + if block.ref_cnt == 0: + to_evict.append(block_hash) + num_blocks_to_evict -= 1 + if num_blocks_to_evict == 0: + break + else: + # we could not evict enough blocks + return None + + # evict blocks + for block_hash in to_evict: + self.backend.free(self.blocks.pop(block_hash)) + + if to_evict and self.events is not None: + 
self.events.append( + OffloadingEvent(block_hashes=to_evict, + block_size=self.backend.block_size, + medium=self.backend.medium, + removed=True)) + + blocks = self.backend.allocate_blocks(block_hashes_to_store) + assert len(blocks) == len(block_hashes_to_store) + + for block_hash, block in zip(block_hashes_to_store, blocks): + self.blocks[block_hash] = block + + # build store specs for allocated blocks + store_spec = self.backend.get_load_store_spec(block_hashes_to_store, + blocks) + + return PrepareStoreOutput(block_hashes_to_store=block_hashes_to_store, + store_spec=store_spec, + block_hashes_evicted=to_evict) + + def complete_store(self, + block_hashes: Iterable[BlockHash], + success: bool = True): + stored_block_hashes: list[BlockHash] = [] + if success: + for block_hash in block_hashes: + block = self.blocks[block_hash] + if not block.is_ready: + block.ref_cnt = 0 + stored_block_hashes.append(block_hash) + else: + for block_hash in block_hashes: + block = self.blocks[block_hash] + if not block.is_ready: + self.backend.free(block) + del self.blocks[block_hash] + + if stored_block_hashes and self.events is not None: + self.events.append( + OffloadingEvent(block_hashes=stored_block_hashes, + block_size=self.backend.block_size, + medium=self.backend.medium, + removed=False)) + + def take_events(self) -> Iterable[OffloadingEvent]: + if self.events is not None: + yield from self.events + self.events.clear() From 6d8246aaffff3ebec84767e373212a7b8da328e2 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Thu, 18 Sep 2025 19:11:59 -0700 Subject: [PATCH 57/58] [gpt-oss] Add ResponseReasoningPartAddedEvent, ResponseReasoningPartDoneEvent for streaming (#24938) Signed-off-by: Andrew Xia --- .../openai/test_response_api_with_harmony.py | 56 +++++++++++- vllm/entrypoints/openai/protocol.py | 88 ++++++++++++++----- vllm/entrypoints/openai/serving_responses.py | 32 ++++--- 3 files changed, 143 insertions(+), 33 deletions(-) diff --git 
a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index eceaff672112f..8d974d56b4450 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -287,6 +287,57 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): assert response3.status == "completed" +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_streaming_types(client: OpenAI, model_name: str): + prompts = [ + "tell me a story about a cat in 20 words", + ] + + # this links the "done" type with the "start" type + # so every "done" type should have a corresponding "start" type + # and every open block should be closed by the end of the stream + pairs_of_event_types = { + "response.completed": "response.created", + "response.output_item.done": "response.output_item.added", + "response.content_part.done": "response.content_part.added", + "response.output_text.done": "response.output_text.delta", + "response.web_search_call.done": "response.web_search_call.added", + "response.reasoning_text.done": "response.reasoning_text.delta", + "response.reasoning_part.done": "response.reasoning_part.added", + } + + for prompt in prompts: + response = await client.responses.create( + model=model_name, + input=prompt, + reasoning={"effort": "low"}, + tools=[], + stream=True, + background=False, + ) + + stack_of_event_types = [] + async for event in response: + if event.type == 'response.created': + stack_of_event_types.append(event.type) + elif event.type == 'response.completed': + assert stack_of_event_types[-1] == pairs_of_event_types[ + event.type] + stack_of_event_types.pop() + if event.type.endswith("added"): + stack_of_event_types.append(event.type) + elif event.type.endswith("delta"): + if stack_of_event_types[-1] == event.type: + continue + stack_of_event_types.append(event.type) + elif 
event.type.endswith("done"): + assert stack_of_event_types[-1] == pairs_of_event_types[ + event.type] + stack_of_event_types.pop() + assert len(stack_of_event_types) == 0 + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("background", [True, False]) @@ -343,7 +394,10 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool): assert event.item_id == current_item_id # verify content_index_id is correct - if event.type == "response.content_part.added": + if event.type in [ + "response.content_part.added", + "response.reasoning_part.added" + ]: assert event.content_index != current_content_index current_content_index = event.content_index elif event.type in [ diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 6b54511a66f33..05d5d6d964dd3 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -31,6 +31,8 @@ from openai.types.responses import ( ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, ResponseStatus, ResponseWebSearchCallCompletedEvent, ResponseWebSearchCallInProgressEvent, ResponseWebSearchCallSearchingEvent) +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent) # Backward compatibility for OpenAI client versions try: # For older openai versions (< 1.100.0) @@ -260,26 +262,6 @@ ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam, ResponseReasoningItem, ResponseFunctionToolCall] -StreamingResponsesResponse: TypeAlias = Union[ - ResponseCreatedEvent, - ResponseInProgressEvent, - ResponseCompletedEvent, - ResponseOutputItemAddedEvent, - ResponseOutputItemDoneEvent, - ResponseContentPartAddedEvent, - ResponseContentPartDoneEvent, - ResponseReasoningTextDeltaEvent, - ResponseReasoningTextDoneEvent, - ResponseCodeInterpreterCallInProgressEvent, - ResponseCodeInterpreterCallCodeDeltaEvent, - ResponseWebSearchCallInProgressEvent, - 
ResponseWebSearchCallSearchingEvent, - ResponseWebSearchCallCompletedEvent, - ResponseCodeInterpreterCallCodeDoneEvent, - ResponseCodeInterpreterCallInterpretingEvent, - ResponseCodeInterpreterCallCompletedEvent, -] - class ResponsesRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -1916,6 +1898,72 @@ class ResponsesResponse(OpenAIBaseModel): ) +# TODO: this code can be removed once +# https://github.com/openai/openai-python/issues/2634 has been resolved +class ResponseReasoningPartDoneEvent(OpenAIBaseModel): + content_index: int + """The index of the content part that is done.""" + + item_id: str + """The ID of the output item that the content part was added to.""" + + output_index: int + """The index of the output item that the content part was added to.""" + + part: ResponseReasoningTextContent + """The content part that is done.""" + + sequence_number: int + """The sequence number of this event.""" + + type: Literal["response.reasoning_part.done"] + """The type of the event. Always `response.reasoning_part.done`.""" + + +# TODO: this code can be removed once +# https://github.com/openai/openai-python/issues/2634 has been resolved +class ResponseReasoningPartAddedEvent(OpenAIBaseModel): + content_index: int + """The index of the content part that was added.""" + + item_id: str + """The ID of the output item that the content part was added to.""" + + output_index: int + """The index of the output item that the content part was added to.""" + + part: ResponseReasoningTextContent + """The content part that was added.""" + + sequence_number: int + """The sequence number of this event.""" + + type: Literal["response.reasoning_part.added"] + """The type of the event.
Always `response.reasoning_part.added`.""" + + +StreamingResponsesResponse: TypeAlias = Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, + ResponseWebSearchCallCompletedEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallInterpretingEvent, + ResponseCodeInterpreterCallCompletedEvent, +] + BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest, RerankRequest] diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 469d74272b0e6..4894623aeac28 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -58,6 +58,8 @@ from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse, InputTokensDetails, OutputTokensDetails, RequestResponseMetadata, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, ResponsesRequest, ResponsesResponse, ResponseUsage, StreamingResponsesResponse) @@ -1280,14 +1282,13 @@ class OpenAIServingResponses(OpenAIServing): # Deal with tool call here pass elif previous_item.channel == "analysis": + content = ResponseReasoningTextContent( + text=previous_item.content[0].text, + type="reasoning_text", + ) reasoning_item = ResponseReasoningItem( type="reasoning", - content=[ - ResponseReasoningTextContent( - text=previous_item.content[0].text, - type="reasoning_text", - ), - ], + content=[content], status="completed", id=current_item_id, summary=[], @@ -1301,6 +1302,15 @@ class 
OpenAIServingResponses(OpenAIServing): content_index=current_content_index, text=previous_item.content[0].text, )) + yield _increment_sequence_number_and_return( + ResponseReasoningPartDoneEvent( + type="response.reasoning_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=content, + )) yield _increment_sequence_number_and_return( ResponseOutputItemDoneEvent( type="response.output_item.done", @@ -1412,17 +1422,15 @@ class OpenAIServingResponses(OpenAIServing): )) current_content_index += 1 yield _increment_sequence_number_and_return( - ResponseContentPartAddedEvent( - type="response.content_part.added", + ResponseReasoningPartAddedEvent( + type="response.reasoning_part.added", sequence_number=-1, output_index=current_output_index, item_id=current_item_id, content_index=current_content_index, - part=ResponseOutputText( - type="output_text", + part=ResponseReasoningTextContent( text="", - annotations=[], - logprobs=[], + type="reasoning_text", ), )) yield _increment_sequence_number_and_return( From 1a0a04dae94b7a768c0d59b4f687bcf5e12d3127 Mon Sep 17 00:00:00 2001 From: Chen Ding Date: Fri, 19 Sep 2025 11:31:16 +0800 Subject: [PATCH 58/58] [Perf] Optimize memory peak during EAGLE model loading. 
(#24585) Signed-off-by: Chen Ding --- vllm/model_executor/models/deepseek_eagle.py | 15 ++++++------- vllm/model_executor/models/llama4_eagle.py | 22 +++++++++----------- vllm/model_executor/models/llama_eagle.py | 15 ++++++------- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index b1d7f24c2f18b..2770ddebc48ab 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -229,14 +229,15 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM): return logits def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + + def transform(inputs): + name, loaded_weight = inputs + if "lm_head" not in name: + name = "model." + name + return name, loaded_weight + loader = AutoWeightsLoader( self, skip_prefixes=None, ) - - model_weights = {} - for name, loaded_weight in weights: - if "lm_head" not in name: - name = "model." + name - model_weights[name] = loaded_weight - loader.load_weights(model_weights.items()) + loader.load_weights(map(transform, weights)) diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index ece490ff2f2a8..a203af53205cd 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -205,23 +205,21 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: + + def transform(inputs): + name, loaded_weight = inputs + name, weight = self.permute_qk_weight_for_rotary( + name, loaded_weight) + if "lm_head" not in name: + name = "model." 
+ name + return name, weight + loader = AutoWeightsLoader( self, # lm_head is tied with target model (Llama4ForCausalLM) skip_prefixes=(["lm_head."]), ) - - model_weights = {} - weights = [ - self.permute_qk_weight_for_rotary(name, loaded_weight) - for name, loaded_weight in weights - ] - for name, loaded_weight in weights: - if "lm_head" not in name: - name = "model." + name - model_weights[name] = loaded_weight - - loader.load_weights(model_weights.items()) + loader.load_weights(map(transform, weights)) def get_input_embeddings( self, diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index a4933b77e3a53..dfae3c3ea5437 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -158,14 +158,15 @@ class EagleLlamaForCausalLM(LlamaForCausalLM): return self.model(input_ids, positions, hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + + def transform(inputs): + name, loaded_weight = inputs + if "lm_head" not in name: + name = "model." + name + return name, loaded_weight + loader = AutoWeightsLoader( self, skip_prefixes=None, ) - - model_weights = {} - for name, loaded_weight in weights: - if "lm_head" not in name: - name = "model." + name - model_weights[name] = loaded_weight - loader.load_weights(model_weights.items()) + loader.load_weights(map(transform, weights))