From b5d90f740048d43376390a61ca5b77c287505d0e Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:28:27 -0400 Subject: [PATCH 001/976] [Bug] Fix DBO IMA issue for DeepEPHT (#27666) Signed-off-by: yewentao256 --- .../layers/fused_moe/deepep_ht_prepare_finalize.py | 12 +++++++++--- vllm/v1/worker/ubatching.py | 9 +++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 13866a5c5bf49..929cff79980c0 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -16,6 +16,7 @@ from vllm.utils.math_utils import round_up from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, dbo_enabled, + dbo_get_previous_event, dbo_switch_to_comm, dbo_switch_to_compute, dbo_switch_to_compute_sync, @@ -110,6 +111,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # for the other ubatch before the dispatch kernel starts. dbo_yield_and_switch_from_compute_to_comm() + # capture a DeepEP event and pass it as previous_event so + # DeepEP honors the dependency internally. + previous_event = dbo_get_previous_event(self.buffer.capture) + ( num_tokens_per_rank, num_tokens_per_rdma_rank, @@ -119,7 +124,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) = self.buffer.get_dispatch_layout( topk_idx=rank_topk_ids, num_experts=num_experts, - previous_event=None, + previous_event=previous_event, async_finish=False, allocate_on_comm_stream=False, ) @@ -148,7 +153,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # to this value. expert_alignment=1, config=self._get_dispatch_config(), - previous_event=None, + previous_event=previous_event, async_finish=self.async_prepare and not dbo_enabled(), allocate_on_comm_stream=False, ) @@ -339,13 +344,14 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): assert fused_expert_output.dtype == torch.bfloat16, ( f"Expected fused_expert_output bfloat16, got {fused_expert_output.dtype}" ) + previous_event = dbo_get_previous_event(self.buffer.capture) combined_x, _, event = self.buffer.combine( # HT combine only supports BF16 x=fused_expert_output, handle=handle, topk_weights=None, config=self._get_combine_config(), - previous_event=None, + previous_event=previous_event, async_finish=do_async and not dbo_enabled(), allocate_on_comm_stream=False, ) diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index 6edcb78486380..9f16b1e6d03ee 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -185,6 +185,15 @@ def dbo_register_recv_hook(recv_hook): next_ctx.recv_hook = recv_hook +def dbo_get_previous_event(func, *args, **kwargs): + if len(_THREAD_ID_TO_CONTEXT) > 0: + ctx_idx = _THREAD_ID_TO_CONTEXT[threading.get_ident()] + ctx = _CURRENT_CONTEXTS[ctx_idx] + # execute callable on the ubatch compute stream to record/wait events there + with torch.cuda.stream(ctx.compute_stream): + return func(*args, **kwargs) + + def make_ubatch_contexts( num_micro_batches: int, compute_stream: torch.cuda.Stream, From 48eb8eba581f0e45272f4e763bf5ec342f77091a Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Wed, 29 Oct 2025 16:17:48 -0700 Subject: [PATCH 002/976] [Temp fix] Disable torch.compile for Qwen2.5 VL's VisionBlock temporarily. 
(#27760) Signed-off-by: Chenheli Hua Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen2_5_vl.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 41cb7084057dd..dfaeb663bbe2f 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -460,15 +460,17 @@ class Qwen2_5_VisionAttention(nn.Module): return output -@support_torch_compile( - dynamic_arg_dims={ - "x": 0, - "cu_seqlens": 0, - "rotary_pos_emb": 0, - "seqlens": 0, - }, - mark_unbacked_dims={"seqlens": 0}, -) +# (FIXME): Enable this after dynamic slicing is fixed +# See https://github.com/vllm-project/vllm/pull/27760 +# @support_torch_compile( +# dynamic_arg_dims={ +# "x": 0, +# "cu_seqlens": 0, +# "rotary_pos_emb": 0, +# "seqlens": 0, +# }, +# mark_unbacked_dims={"seqlens": 0}, +# ) class Qwen2_5_VisionBlock(nn.Module): def __init__( self, From b798e39f931ad42354e0223de3d49e24523b79af Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Thu, 30 Oct 2025 09:43:13 +0800 Subject: [PATCH 003/976] [XPU][bugfix] fix rope for llama4 and deepseek (#25145) Signed-off-by: Yan Ma --- .../layers/rotary_embedding/base.py | 17 +++++++++++++- .../rotary_embedding/deepseek_scaling_rope.py | 4 ++-- .../rotary_embedding/llama4_vision_rope.py | 11 ++-------- .../layers/rotary_embedding/mrope.py | 22 ++----------------- 4 files changed, 22 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 711902f0cc67e..91276320df4d0 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -14,7 +14,7 @@ from .rocm_aiter_rope_ops import ( @CustomOp.register("rotary_embedding") -class RotaryEmbedding(CustomOp): +class RotaryEmbeddingBase(CustomOp): """Original rotary positional embedding.""" def __init__( @@ -86,6 +86,21 @@ class RotaryEmbedding(CustomOp): ): self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) + +class RotaryEmbedding(RotaryEmbeddingBase): + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__( + head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype + ) + def forward_native( self, positions: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py index 2e5efec066634..d9134f05fddff 100644 --- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py @@ -7,7 +7,7 @@ import torch from vllm.platforms import current_platform -from .base import RotaryEmbedding +from .base import RotaryEmbeddingBase from .common import ( rotate_gptj, rotate_neox, @@ -22,7 +22,7 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: return 0.1 * mscale * math.log(scale) + 1.0 -class DeepseekScalingRotaryEmbedding(RotaryEmbedding): +class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase): """RotaryEmbedding extended with YaRN method. Credits to Peng et al. 
github.com/jquesnelle/yarn diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py index 6241cb5abbc8e..9fdac309df7ee 100644 --- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py @@ -5,10 +5,10 @@ import math import torch -from .base import RotaryEmbedding +from .base import RotaryEmbeddingBase -class Llama4VisionRotaryEmbedding(RotaryEmbedding): +class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase): def __init__( self, head_size: int, @@ -78,10 +78,3 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): key: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None]: return self.forward_native(query, key) - - def forward_hip( # type: ignore[override] - self, - query: torch.Tensor, - key: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(query, key) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index d269733083d83..3c184ce9d6316 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -7,7 +7,7 @@ import torch from vllm.triton_utils import tl, triton -from .base import RotaryEmbedding +from .base import RotaryEmbeddingBase from .common import apply_rotary_emb_dispatch from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale @@ -199,7 +199,7 @@ def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.T return x_t -class MRotaryEmbedding(RotaryEmbedding): +class MRotaryEmbedding(RotaryEmbeddingBase): """Rotary Embedding with Multimodal Sections.""" def __init__( @@ -357,24 +357,6 @@ class MRotaryEmbedding(RotaryEmbedding): key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key - def forward_xpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor | None = None, - offsets: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(positions, query, key, offsets) - - def forward_cpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor | None = None, - offsets: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor | None]: - return self.forward_native(positions, query, key, offsets) - @staticmethod def get_next_input_positions( mrope_position_delta: int, From d7fb10c574a3a9cbf596bec086bf02603b71c5c8 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 29 Oct 2025 19:39:57 -0700 Subject: [PATCH 004/976] [Bugfix] mamba-block-size is set for vision language model (#27773) Signed-off-by: Chen Zhang --- vllm/config/cache.py | 10 +--------- vllm/config/vllm.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 1734f6b15d4af..d743d5aa9dd29 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -5,7 +5,7 @@ import hashlib from dataclasses import field from typing import TYPE_CHECKING, Any, Literal -from pydantic import Field, SkipValidation, field_validator, model_validator +from pydantic import Field, SkipValidation, field_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -185,11 +185,3 @@ class CacheConfig: raise ValueError("Too large swap space. 
" + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: logger.warning("Possibly too large swap space. %s", msg) - - @model_validator(mode="after") - def validate_mamba_block_size(self) -> "CacheConfig": - if self.mamba_block_size is not None and not self.enable_prefix_caching: - raise ValueError( - "--mamba-block-size can only be set with --enable-prefix-caching" - ) - return self diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index a7f7f3b45abea..c46f409edab61 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -17,7 +17,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar import torch -from pydantic import ConfigDict, Field +from pydantic import ConfigDict, Field, model_validator from pydantic.dataclasses import dataclass import vllm.envs as envs @@ -943,6 +943,20 @@ class VllmConfig: f"compilation_config={self.compilation_config!r}" ) + @model_validator(mode="after") + def validate_mamba_block_size(self) -> "VllmConfig": + if self.model_config is None: + return self + mamba_block_size_is_set = ( + self.cache_config.mamba_block_size is not None + and self.cache_config.mamba_block_size != self.model_config.max_model_len + ) + if mamba_block_size_is_set and not self.cache_config.enable_prefix_caching: + raise ValueError( + "--mamba-block-size can only be set with --enable-prefix-caching" + ) + return self + _current_vllm_config: VllmConfig | None = None _current_prefix: str | None = None From b5bae42f913efebef6d5239291418df8fb73b555 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 30 Oct 2025 11:17:13 +0800 Subject: [PATCH 005/976] [XPU] Update latest IPEX 2.8 release (#27735) Signed-off-by: Kunshang Ji --- .../scripts/hardware_ci/run-xpu-test.sh | 7 +++++-- .../installation/gpu.xpu.inc.md | 4 +++- requirements/xpu.txt | 2 +- vllm/_ipex_ops.py | 21 +++++-------------- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 250a64fdd071c..27ed67c4517e2 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -20,7 +20,10 @@ trap remove_docker_container EXIT # Run the image and test offline inference/tensor parallel docker run \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ + --net=host \ + --ipc=host \ + --privileged \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ -e "HF_TOKEN=${HF_TOKEN}" \ @@ -42,7 +45,7 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py ' diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md index 9156df9db6df3..620a660a240ed 100644 --- a/docs/getting_started/installation/gpu.xpu.inc.md +++ b/docs/getting_started/installation/gpu.xpu.inc.md @@ -56,8 +56,10 @@ docker build -f docker/Dockerfile.xpu -t 
vllm-xpu-env --shm-size=4g . docker run -it \ --rm \ --network=host \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ + --ipc=host \ + --privileged \ vllm-xpu-env ``` diff --git a/requirements/xpu.txt b/requirements/xpu.txt index d14b631aa9364..e69a98b86036e 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -15,4 +15,4 @@ torchaudio torchvision --extra-index-url=https://download.pytorch.org/whl/xpu -intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index e773e1d13f0b8..60ee0124c3d9c 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -151,7 +151,9 @@ class ipex_ops: def rms_norm( input: torch.Tensor, weight: torch.Tensor, epsilon: float ) -> torch.Tensor: - return ipex.llm.functional.rms_norm(input, weight, epsilon) + out = torch.empty_like(input) + torch.ops.torch_ipex.rms_norm_vllm(out, input.contiguous(), weight, epsilon) + return out @staticmethod def fused_add_rms_norm( @@ -160,10 +162,7 @@ class ipex_ops: weight: torch.Tensor, epsilon: float, ) -> None: - tmp = ipex.llm.functional.add_rms_norm( - residual, input, weight, None, epsilon, True - ) - input.copy_(tmp) + torch.ops.torch_ipex.fused_add_rms_norm_vllm(input, residual, weight, epsilon) @staticmethod def varlen_attention( @@ -296,16 +295,6 @@ class ipex_ops: num_splits=0, s_aux: torch.Tensor | None = None, ): - if cu_seqlens_k is None: - # cu_seqlens_k is not used in ipex kernel. 
- cu_seqlens_k = torch.cumsum(seqused_k, dim=0) - cu_seqlens_k = torch.cat( - [ - torch.tensor([0], device=seqused_k.device, dtype=torch.int32), - cu_seqlens_k, - ] - ).to(torch.int32) - real_window_size: tuple[int, int] if window_size is None: real_window_size = (-1, -1) @@ -318,7 +307,7 @@ class ipex_ops: k, v, cu_seqlens_q, - cu_seqlens_k, + seqused_k, max_seqlen_q, max_seqlen_k, softmax_scale, From 2ce5c5d3d65a53e81b5117867f5ce9c873e68334 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 29 Oct 2025 21:04:25 -0700 Subject: [PATCH 006/976] [BugFix] Handle unscheduled requests properly when async scheduling (#27756) Signed-off-by: Nick Hill --- tests/v1/tpu/worker/test_tpu_model_runner.py | 4 +- tests/v1/worker/test_gpu_model_runner.py | 6 +-- .../kv_connector/v1/offloading_connector.py | 2 +- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 4 +- .../v1/shared_storage_connector.py | 2 +- vllm/v1/core/sched/output.py | 32 +++++++++++---- vllm/v1/core/sched/scheduler.py | 39 ++++++++++--------- vllm/v1/worker/gpu_model_runner.py | 15 +++---- vllm/v1/worker/tpu_model_runner.py | 2 +- 9 files changed, 63 insertions(+), 43 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 1aa0709696c41..18aa599f1aaf7 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -212,10 +212,12 @@ def test_update_states_request_resumed(model_runner): # resume req cached_req_data = CachedRequestData( req_ids=[req_id], - resumed_from_preemption=[False], + resumed_req_ids={req_id}, new_token_ids=[[]], + all_token_ids={req_id: scheduler_output.scheduled_new_reqs[0].prompt_token_ids}, new_block_ids=[([],)], num_computed_tokens=[0], + num_output_tokens=[0], ) scheduler_output = SchedulerOutput( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index c2c34ee95ad5f..9007436350be4 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -259,10 +259,10 @@ def test_update_states_request_resumed(model_runner, dist_init): # resume req cached_req_data = CachedRequestData( req_ids=[req_id], - resumed_from_preemption=[False], + resumed_req_ids=set(), new_token_ids=[[]], - resumed_req_token_ids=[None], - new_block_ids=([[0]],), + all_token_ids={}, + new_block_ids=[([0],)], num_computed_tokens=[0], num_output_tokens=[0], ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 6d4ffc152de97..19344e5784c23 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -494,5 +494,5 @@ def yield_req_data( yield from zip( cached_reqs.req_ids, cached_reqs.new_block_ids, - cached_reqs.resumed_from_preemption, + (req_id in cached_reqs.resumed_req_ids for req_id in cached_reqs.req_ids), ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index e47cde2614fc2..780dd12fccda3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -415,10 +415,10 @@ class P2pNcclConnector(KVConnectorBase_V1): for i, req_id in enumerate(cached_reqs.req_ids): num_computed_tokens = cached_reqs.num_computed_tokens[i] new_block_ids = 
cached_reqs.new_block_ids[i] - resumed_from_preemption = cached_reqs.resumed_from_preemption[i] + resumed_from_preemption = req_id in cached_reqs.resumed_req_ids if self.is_producer: - num_scheduled_tokens = (scheduler_output.num_scheduled_tokens)[req_id] + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id] num_tokens = num_scheduled_tokens + num_computed_tokens assert req_id in self.chunked_prefill assert new_block_ids is not None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index fc277630603aa..9c230d7d0d2f4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -336,7 +336,7 @@ class SharedStorageConnector(KVConnectorBase_V1): cached_reqs = scheduler_output.scheduled_cached_reqs for i, req_id in enumerate(cached_reqs.req_ids): - resumed_from_preemption = cached_reqs.resumed_from_preemption[i] + resumed_from_preemption = req_id in cached_reqs.resumed_req_ids if not resumed_from_preemption or req_id not in self._requests_need_load: continue diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 035394f045301..cc6b89e2bf3f1 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -2,8 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass +from functools import cached_property from typing import TYPE_CHECKING +from typing_extensions import deprecated + from vllm._bc_linter import bc_linter_include if TYPE_CHECKING: @@ -96,16 +99,16 @@ class NewRequestData: @dataclass class CachedRequestData: req_ids: list[str] - # If resumed_from_preemption is False, new_block_ids will be appended to - # the request's block IDs. If True, new_block_ids will be used as the + # For request ids not in resumed_req_ids, new_block_ids will be appended to + # the request's block IDs. For those in the set, new_block_ids will be used as the # request's block IDs instead of appending to the existing block IDs. - resumed_from_preemption: list[bool] + resumed_req_ids: set[str] # NOTE(woosuk): new_token_ids is only used for pipeline parallelism. # When PP is not used, new_token_ids will be empty. new_token_ids: list[list[int]] - # If resumed_from_preemption is True, propogate the token ids to the - # connector, otherwise will be empty. - resumed_req_token_ids: list[list[int] | None] + # For requests not scheduled in the last step, propagate the token ids to the + # connector. Won't contain requests that were scheduled in the prior step. + all_token_ids: dict[str, list[int]] new_block_ids: list[tuple[list[int], ...] 
| None] num_computed_tokens: list[int] num_output_tokens: list[int] @@ -114,13 +117,26 @@ class CachedRequestData: def num_reqs(self) -> int: return len(self.req_ids) + @cached_property + @deprecated("use resumed_req_ids field") + def resumed_from_preemption(self) -> list[bool]: + return [req_id in self.resumed_req_ids for req_id in self.req_ids] + + @cached_property + @deprecated("use all_token_ids field") + def resumed_req_token_ids(self) -> list[list[int] | None]: + return [ + self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None + for req_id in self.req_ids + ] + @classmethod def make_empty(cls) -> "CachedRequestData": return cls( req_ids=[], - resumed_from_preemption=[], + resumed_req_ids=set(), new_token_ids=[], - resumed_req_token_ids=[], + all_token_ids={}, new_block_ids=[], num_computed_tokens=[], num_output_tokens=[], diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 00b34fe4fbb98..c794886bc24c8 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -71,6 +71,7 @@ class Scheduler(SchedulerInterface): self.finished_req_ids_dict: dict[int, set[str]] | None = ( defaultdict(set) if include_finished_set else None ) + self.prev_step_scheduled_req_ids: set[str] = set() # Scheduling constraints. self.max_num_running_reqs = self.scheduler_config.max_num_seqs @@ -444,14 +445,9 @@ class Scheduler(SchedulerInterface): # `request.num_prompt_tokens` to consider the resumed # requests, which have output tokens. num_new_tokens = request.num_tokens - num_computed_tokens - if ( - 0 - < self.scheduler_config.long_prefill_token_threshold - < num_new_tokens - ): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold - ) + threshold = self.scheduler_config.long_prefill_token_threshold + if 0 < threshold < num_new_tokens: + num_new_tokens = threshold # chunked prefill has to be enabled explicitly to allow # pooling requests to be chunked @@ -620,6 +616,11 @@ class Scheduler(SchedulerInterface): structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask( num_scheduled_tokens.keys(), scheduled_spec_decode_tokens ) + + # Record the request ids that were scheduled in this step. + self.prev_step_scheduled_req_ids.clear() + self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys()) + scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, scheduled_cached_reqs=cached_reqs_data, @@ -691,14 +692,12 @@ class Scheduler(SchedulerInterface): req_ids: list[str] = [] new_token_ids: list[list[int]] = [] new_block_ids: list[tuple[list[int], ...] | None] = [] - resumed_req_token_ids: list[list[int] | None] = [] + all_token_ids: dict[str, list[int]] = {} num_computed_tokens: list[int] = [] num_output_tokens: list[int] = [] + resumed_req_ids = set() - # Because resumed_reqs is usually empty, it is more efficient to do - # in-place appending so that we don't need to allocate a new list. 
- resumed_from_preemption = [False] * len(running_reqs) - resumed_from_preemption += [True] * len(resumed_reqs) + num_running_reqs = len(running_reqs) for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)): req_id = req.request_id req_ids.append(req_id) @@ -715,12 +714,14 @@ class Scheduler(SchedulerInterface): req.num_computed_tokens : req.num_computed_tokens + num_tokens ] new_token_ids.append(token_ids) - resumed_token_ids = None - if resumed_from_preemption[idx]: - resumed_token_ids = req.all_token_ids[ + scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids + if idx >= num_running_reqs: + assert not scheduled_in_prev_step + resumed_req_ids.add(req_id) + if not scheduled_in_prev_step: + all_token_ids[req_id] = req.all_token_ids[ : req.num_computed_tokens + num_tokens ] - resumed_req_token_ids.append(resumed_token_ids) new_block_ids.append( req_to_new_blocks[req_id].get_block_ids(allow_none=True) ) @@ -731,9 +732,9 @@ class Scheduler(SchedulerInterface): return CachedRequestData( req_ids=req_ids, - resumed_from_preemption=resumed_from_preemption, + resumed_req_ids=resumed_req_ids, new_token_ids=new_token_ids, - resumed_req_token_ids=resumed_req_token_ids, + all_token_ids=all_token_ids, new_block_ids=new_block_ids, num_computed_tokens=num_computed_tokens, num_output_tokens=num_output_tokens, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e350988456f12..1fe749c614ccf 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -706,7 +706,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] - resumed_from_preemption = req_data.resumed_from_preemption[i] + resumed_from_preemption = req_id in req_data.resumed_req_ids num_output_tokens = req_data.num_output_tokens[i] # Update the cached states. @@ -754,16 +754,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Replace the existing block IDs with the new ones. req_state.block_ids = new_block_ids - if self.use_async_scheduling and num_output_tokens > 0: - # We must recover the output token ids for resumed requests in the - # async scheduling case, so that correct input_ids are obtained. - resumed_token_ids = req_data.resumed_req_token_ids[i] - assert resumed_token_ids is not None - req_state.output_token_ids = resumed_token_ids[-num_output_tokens:] if req_index is None: # The request is not in the persistent batch. # The request was either preempted and resumed later, or was not # scheduled in the previous step and needs to be added again. + + if self.use_async_scheduling and num_output_tokens > 0: + # We must recover the output token ids for resumed requests in the + # async scheduling case, so that correct input_ids are obtained. 
+ resumed_token_ids = req_data.all_token_ids[req_id] + req_state.output_token_ids = resumed_token_ids[-num_output_tokens:] + reqs_to_add.append(req_state) continue diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5d7b181989ce5..0ced138b940d0 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -483,7 +483,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] - resumed_from_preemption = req_data.resumed_from_preemption[i] + resumed_from_preemption = req_id in req_data.resumed_req_ids # Update the cached states. req_state.num_computed_tokens = num_computed_tokens From 17d055f527d2bd5d39a1352e5161ed82345466ac Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Thu, 30 Oct 2025 04:09:10 +0000 Subject: [PATCH 007/976] [Feat] Adds runai distributed streamer (#27230) Signed-off-by: bbartels Signed-off-by: Benjamin Bartels Co-authored-by: omer-dayan Co-authored-by: Cyrus Leung --- docker/Dockerfile | 2 +- docs/models/extensions/runai_model_streamer.md | 9 +++++++++ requirements/nightly_torch_test.txt | 2 +- requirements/rocm.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 6 +++--- setup.py | 2 +- .../model_loader/runai_streamer_loader.py | 10 ++++++++-- vllm/model_executor/model_loader/weight_utils.py | 15 ++++++++++++++- 9 files changed, 39 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index eb1453126e6f4..42a830cb605ad 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -495,7 +495,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ else \ BITSANDBYTES_VERSION="0.46.1"; \ fi; \ - uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.14.0' + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docs/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md index c2cf107263a03..fc9d5eec3803e 100644 --- a/docs/models/extensions/runai_model_streamer.md +++ b/docs/models/extensions/runai_model_streamer.md @@ -45,6 +45,15 @@ vllm serve s3://core-llm/Llama-3-8b \ You can tune parameters using `--model-loader-extra-config`: +You can tune `distributed` that controls whether distributed streaming should be used. This is currently only possible on CUDA and ROCM devices. This can significantly improve loading times from object storage or high-throughput network fileshares. +You can read further about Distributed streaming [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/usage.md#distributed-streaming) + +```bash +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \ + --load-format runai_streamer \ + --model-loader-extra-config '{"distributed":true}' +``` + You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. For reading from S3, it will be the number of client instances the host is opening to the S3 server. 
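Since `RunaiModelStreamerLoader` reads `distributed` and `concurrency` independently from `model_loader_extra_config`, the two knobs described above can be combined. A minimal offline-inference sketch (the model path and the concurrency value are placeholders, and it assumes the usual `LLM` engine-arg passthrough rather than anything introduced by this patch; `distributed` still requires a CUDA/ROCm device as noted above):

```python
# Sketch: combine the new "distributed" flag with the existing "concurrency"
# knob when loading through the Python API. Values below are illustrative.
from vllm import LLM

llm = LLM(
    model="s3://core-llm/Llama-3-8b",  # any path the streamer supports (local, S3, GCS)
    load_format="runai_streamer",
    model_loader_extra_config={"distributed": True, "concurrency": 16},
)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```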
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index dea1926bbd695..63c1908f024b3 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -42,6 +42,6 @@ tritonclient==2.51.0 numba == 0.61.2 # Required for N-gram speculative decoding numpy -runai-model-streamer[s3,gcs]==0.14.0 +runai-model-streamer[s3,gcs]==0.15.0 fastsafetensors>=0.1.10 pydantic>=2.12 # 2.11 leads to error on python 3.13 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index d9743f0446438..6f1cca90e5e2b 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -12,6 +12,6 @@ tensorizer==2.10.1 packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 -runai-model-streamer[s3,gcs]==0.14.0 +runai-model-streamer[s3,gcs]==0.15.0 conch-triton-kernels==1.2.1 timm>=1.0.17 diff --git a/requirements/test.in b/requirements/test.in index a79ec839dbec1..b1ab599ff16e5 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -50,7 +50,7 @@ tritonclient==2.51.0 numba == 0.61.2 # Required for N-gram speculative decoding numpy -runai-model-streamer[s3,gcs]==0.14.0 +runai-model-streamer[s3,gcs]==0.15.0 fastsafetensors>=0.1.10 pydantic>=2.12 # 2.11 leads to error on python 3.13 decord==0.6.0 diff --git a/requirements/test.txt b/requirements/test.txt index bc007ccf10bbb..e54bb49fde684 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -965,11 +965,11 @@ rsa==4.9.1 # via google-auth rtree==1.4.0 # via torchgeo -runai-model-streamer==0.14.0 +runai-model-streamer==0.15.0 # via -r requirements/test.in -runai-model-streamer-gcs==0.14.0 +runai-model-streamer-gcs==0.15.0 # via runai-model-streamer -runai-model-streamer-s3==0.14.0 +runai-model-streamer-s3==0.15.0 # via runai-model-streamer s3transfer==0.10.3 # via boto3 diff --git a/setup.py b/setup.py index 83a4e3eea57c8..8139d0d62b8ac 100644 --- a/setup.py +++ b/setup.py @@ -712,7 +712,7 @@ setup( "bench": ["pandas", "matplotlib", "seaborn", "datasets"], "tensorizer": ["tensorizer==2.10.1"], "fastsafetensors": ["fastsafetensors >= 0.1.10"], - "runai": ["runai-model-streamer[s3,gcs] >= 0.14.0"], + "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"], "audio": [ "librosa", "soundfile", diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 079e3168647bb..93da07c550195 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -27,9 +27,16 @@ class RunaiModelStreamerLoader(BaseModelLoader): def __init__(self, load_config: LoadConfig): super().__init__(load_config) + + self._is_distributed = False if load_config.model_loader_extra_config: extra_config = load_config.model_loader_extra_config + if "distributed" in extra_config and isinstance( + extra_config.get("distributed"), bool + ): + self._is_distributed = extra_config.get("distributed") + if "concurrency" in extra_config and isinstance( extra_config.get("concurrency"), int ): @@ -92,8 +99,7 @@ class RunaiModelStreamerLoader(BaseModelLoader): """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) return runai_safetensors_weights_iterator( - hf_weights_files, - self.load_config.use_tqdm_on_load, + hf_weights_files, self.load_config.use_tqdm_on_load, self._is_distributed ) def download_model(self, model_config: ModelConfig) -> None: diff --git a/vllm/model_executor/model_loader/weight_utils.py 
b/vllm/model_executor/model_loader/weight_utils.py index 5a9faefa4d894..3dbe803f99860 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -657,10 +657,22 @@ def multi_thread_safetensors_weights_iterator( def runai_safetensors_weights_iterator( hf_weights_files: list[str], use_tqdm_on_load: bool, + is_distributed: bool = False, ) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" with SafetensorsStreamer() as streamer: - streamer.stream_files(hf_weights_files) + is_cuda_alike = current_platform.is_cuda_alike() + device = ( + f"cuda:{current_platform.current_device()}" + if is_distributed and is_cuda_alike + else "cpu" + ) + + streamer.stream_files( + hf_weights_files, + device=device, + is_distributed=is_distributed, + ) total_tensors = sum( len(tensors_meta) for tensors_meta in streamer.files_to_tensors_metadata.values() @@ -672,6 +684,7 @@ def runai_safetensors_weights_iterator( desc="Loading safetensors using Runai Model Streamer", bar_format=_BAR_FORMAT, disable=not enable_tqdm(use_tqdm_on_load), + mininterval=2, ) yield from tensor_iter From b8c48c5d722298656074c559d0e8d702a6c28da1 Mon Sep 17 00:00:00 2001 From: Fardin Hoque Date: Wed, 29 Oct 2025 21:10:34 -0700 Subject: [PATCH 008/976] kernels/moe test pruning (#27053) Signed-off-by: Fardin Hoque Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- tests/kernels/moe/test_batched_moe.py | 25 +++++++++++++------ tests/kernels/moe/test_block_fp8.py | 14 ----------- tests/kernels/moe/test_block_int8.py | 10 +------- tests/kernels/moe/test_cutlass_moe.py | 3 --- tests/kernels/moe/test_deepep_deepgemm_moe.py | 1 - tests/kernels/moe/test_deepgemm.py | 2 -- tests/kernels/moe/test_flashinfer.py | 2 -- tests/kernels/moe/test_flashinfer_moe.py | 4 +-- tests/kernels/moe/test_grouped_topk.py | 2 +- .../moe/test_modular_kernel_combinations.py | 8 ++++++ tests/kernels/moe/test_moe.py | 11 +++----- tests/kernels/moe/test_nvfp4_moe.py | 4 +-- .../moe/test_silu_mul_fp8_quant_deep_gemm.py | 4 --- 13 files changed, 34 insertions(+), 56 deletions(-) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 2dce099770f08..62704bbcbbc79 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -24,23 +24,16 @@ from vllm.triton_utils import tl MNK_FACTORS = [ (1, 128, 128), - (1, 128, 2048), (1, 512, 512), - (1, 1024, 128), (1, 1024, 2048), (32, 128, 128), (32, 512, 512), (32, 1024, 2048), - (45, 128, 128), (45, 128, 2048), - (45, 512, 512), (45, 1024, 128), - (45, 1024, 2048), (64, 512, 512), (64, 1024, 2048), - (222, 128, 128), (222, 128, 2048), - (222, 1024, 128), (222, 1024, 2048), ] NUM_EXPERTS = [8, 64] @@ -117,10 +110,19 @@ def test_batched_mm( block_shape: list[int] | None, per_act_token_quant: bool, ): + """Note: float8_e4m3fn is not supported on CUDA architecture < 89, + and those tests will be skipped on unsupported hardware.""" current_platform.seed_everything(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn + if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability( + 89 + ): + pytest.skip( + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) + if (per_act_token_quant or block_shape is not None) and not use_fp8_w8a8: pytest.skip("Don't test blocking for non-quantized types.") @@ -244,10 +246,19 @@ def test_fused_moe_batched_experts( block_shape: list[int] | None, 
input_scales: bool, ): + """Note: float8_e4m3fn is not supported on CUDA architecture < 89, + and those tests will be skipped on unsupported hardware.""" current_platform.seed_everything(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn + if (dtype == torch.float8_e4m3fn) and not current_platform.has_device_capability( + 89 + ): + pytest.skip( + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) + if topk > e: pytest.skip("topk > e") diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 60f9f14b7f6f1..cd34617ee0fc4 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -42,57 +42,43 @@ DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] # and its hidden size is 7168. MNK_FACTORS = [ (1, 128, 128), - (1, 512, 512), (1, 128, 7168), (1, 1024, 7168), (1, 4608, 128), - (1, 4608, 512), (1, 4608, 7168), (83, 128, 128), (83, 512, 512), - (83, 1024, 7168), (83, 4608, 512), (83, 4608, 7168), - (128, 128, 128), (128, 512, 512), (128, 1024, 7168), - (128, 4608, 512), (128, 4608, 7168), (2048, 128, 128), (2048, 1024, 7168), (2048, 4608, 512), (2048, 4608, 7168), (8192, 128, 128), - (8192, 512, 512), (8192, 128, 7168), (8192, 1024, 7168), - (8192, 4608, 512), (8192, 4608, 7168), ] MNK_FACTORS_DG = [ (128, 128, 128), - (128, 512, 512), (128, 128, 7168), (128, 1024, 7168), (128, 4608, 128), - (128, 4608, 512), (128, 4608, 7168), - (192, 128, 128), (192, 512, 512), (192, 1024, 7168), - (192, 4608, 512), (192, 4608, 7168), (1335, 128, 128), (1335, 1024, 7168), (1335, 4608, 512), (1335, 4608, 7168), (2048, 128, 128), - (2048, 512, 512), (2048, 128, 7168), (2048, 1024, 7168), - (2048, 4608, 128), - (2048, 4608, 512), (2048, 4608, 7168), ] diff --git a/tests/kernels/moe/test_block_int8.py b/tests/kernels/moe/test_block_int8.py index 74cc943714dd9..3799e60f1294a 100644 --- a/tests/kernels/moe/test_block_int8.py +++ b/tests/kernels/moe/test_block_int8.py @@ -21,36 +21,28 @@ vllm_config = VllmConfig() vllm_config.scheduler_config.max_num_seqs = 128 vllm_config.scheduler_config.max_model_len = 8192 -DTYPES = [torch.half, torch.bfloat16] +DTYPES = [torch.bfloat16] MNK_FACTORS = [ (1, 128, 128), - (1, 512, 512), (1, 128, 7168), (1, 1024, 7168), - (1, 4096, 128), (1, 4096, 512), (1, 4096, 7168), - (33, 128, 128), (33, 512, 512), (33, 128, 7168), (33, 1024, 7168), (33, 4096, 128), - (33, 4096, 512), (33, 4096, 7168), (128, 128, 128), - (128, 512, 512), (128, 1024, 7168), (128, 4096, 512), (128, 4096, 7168), - (222, 128, 128), (222, 512, 512), (222, 1024, 7168), - (222, 4096, 512), (222, 4096, 7168), (2048, 128, 128), (2048, 1024, 7168), - (2048, 4096, 512), (2048, 4096, 4096), ] diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 4330eda251f75..5512ccce47b05 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -26,16 +26,13 @@ TOP_KS = [6, 8] MNK_FACTORS = [ (2, 1024, 1024), - (2, 1024, 1536), (2, 3072, 1024), (2, 3072, 1536), (7, 3072, 1536), (64, 1024, 1024), (64, 1024, 1536), (64, 3072, 1024), - (64, 3072, 1536), (224, 1024, 1024), - (224, 1024, 1536), (224, 3072, 1024), (224, 3072, 1536), (32768, 1024, 1024), diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index d46f453488a98..9d039b81690a1 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -393,7 +393,6 @@ def 
_test_deepep_deepgemm_moe( MNKs = [ (8, 128, 128), (8, 128, 512), - (8, 512, 512), (3, 1024, 2048), (32, 128, 1024), (45, 512, 2048), diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index cad0085d5ba6e..9b1054f7d0ab8 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -130,10 +130,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size): # Note: N <= 512 will disable the deepgemm path due to performance issues. MNKs = [ (1024, 768, 128), - (1024, 768, 512), (2048, 768, 512), (512, 1024, 1024), - (512, 2048, 2048), (4096, 4096, 1024), ] diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 0780232a82640..f985f9ac7ca67 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -34,8 +34,6 @@ TOP_KS = [1] MNK_FACTORS = [ (256, 8192, 5120), - (256, 4096, 5120), - (127, 8192, 5120), (127, 4096, 5120), (10, 8192, 5120), (10, 4096, 5120), diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index 18cfd4f79092d..be3e36865d1a4 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -34,10 +34,8 @@ if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_cap MNK_FACTORS = [ (2, 1024, 1024), - (2, 1024, 1536), (2, 3072, 1024), (2, 3072, 1536), - (64, 1024, 1024), (64, 1024, 1536), (64, 3072, 1024), (64, 2048, 1536), @@ -49,7 +47,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", [40, 64, 256]) @pytest.mark.parametrize("topk", [1, 6, 8]) -@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @torch.inference_mode() def test_flashinfer_fp4_moe_no_graph( m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py index 3f4f142be7674..662e0723b7583 100644 --- a/tests/kernels/moe/test_grouped_topk.py +++ b/tests/kernels/moe/test_grouped_topk.py @@ -27,7 +27,7 @@ from vllm.platforms import current_platform @pytest.mark.parametrize("topk_group", [2]) @pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"]) @pytest.mark.parametrize("routed_scaling_factor", [1.0, 2.5]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32]) def test_grouped_topk( monkeypatch: pytest.MonkeyPatch, n_token: int, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index a46b0053e75a3..e3b8621b452fa 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -295,6 +295,8 @@ def test_modular_kernel_combinations_singlegpu( world_size: int, pytestconfig, ): + """Note: float8_e4m3fn is not supported on CUDA architecture < 89, + and those tests will be skipped on unsupported hardware.""" config = Config( Ms=Ms, K=k, @@ -309,6 +311,12 @@ def test_modular_kernel_combinations_singlegpu( world_size=world_size, ) + if ( + quant_config is not None and quant_config.quant_dtype == torch.float8_e4m3fn + ) and not current_platform.has_device_capability(89): + pytest.skip( + "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" + ) verbosity = pytestconfig.getoption("verbose") run(config, verbosity > 0) diff --git 
a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 2c802ff4e6bd6..014df1fa111f2 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -66,8 +66,6 @@ FUSED_MOE_MNK_FACTORS = [ (1, 128, 128), (1, 2048, 128), (33, 2048, 128), - (222, 1024, 1024), - (32768, 128, 128), (32768, 2048, 511), (40000, 1024, 1024), ] @@ -76,7 +74,6 @@ FUSED_MOE_WN16_MNK_FACTORS = [ (1, 128, 128), (1, 1024, 1024), (32, 2048, 128), - (32, 1024, 1024), (222, 2048, 1024), ] @@ -512,8 +509,8 @@ def marlin_moe_generate_valid_test_cases(): e_list = [4, 12] topk_list = [2, 3] ep_size_list = [1, 4] - dtype_list = [torch.half, torch.bfloat16] - group_size_list = [-1, 16, 32, 128] + dtype_list = [torch.bfloat16] + group_size_list = [-1, 32, 128] act_order_list = [True, False] quant_type_list = [ scalar_types.float4_e2m1f, @@ -885,10 +882,10 @@ def test_batched_moe_align_block_size_opcheck(): ) -@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("m", [1, 33, 222]) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) @pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): input = torch.randn((m, topk, k), device="cuda", dtype=dtype) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index dae19c0b2b31b..aa544fe0e0f63 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -26,9 +26,7 @@ MNK_FACTORS = [ (2, 1024, 1024), (2, 1024, 1536), (2, 3072, 1024), - (2, 3072, 1536), (64, 1024, 1024), - (64, 1024, 1536), (64, 3072, 1024), (64, 2048, 1536), (224, 1024, 1024), @@ -39,7 +37,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", [40, 64, 256]) @pytest.mark.parametrize("topk", [1, 6, 8]) -@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) @torch.inference_mode() def test_cutlass_fp4_moe_no_graph( m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index 92e78ec2396dd..97a55c37b9a3e 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -19,20 +19,16 @@ CASES = [ (32, 64, 256, fp8_dtype), (17, 31, 768, fp8_dtype), (1, 1, 128 * 1, fp8_dtype), - (1, 1, 128 * 2, fp8_dtype), (1, 1, 128 * 3, fp8_dtype), (1, 1, 128 * 4, fp8_dtype), (8, 16, 128 * 1, fp8_dtype), (8, 16, 128 * 2, fp8_dtype), (8, 16, 128 * 3, fp8_dtype), - (8, 16, 128 * 4, fp8_dtype), (8, 64, 7168, fp8_dtype), (8, 128, 7168, fp8_dtype), - (8, 256, 7168, fp8_dtype), (8, 512, 7168, fp8_dtype), (8, 1024, 7168, fp8_dtype), (256, 8, 7168, fp8_dtype), - (256, 16, 7168, fp8_dtype), (256, 32, 7168, fp8_dtype), (256, 64, 7168, fp8_dtype), # Only add a few fnuz tests to help with long CI times. 
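The fp8 capability guard added above is repeated in a few of the test files touched by this patch; as a sketch only (the helper name and location are hypothetical, not part of this patch), it could be factored into a shared test utility:

```python
# Hypothetical shared test helper mirroring the guard added in these tests;
# the function name is illustrative and does not exist in the repository.
import pytest
import torch

from vllm.platforms import current_platform


def skip_if_fp8_unsupported(dtype: torch.dtype) -> None:
    """Skip fp8e4nv cases on GPUs older than compute capability 8.9."""
    if dtype == torch.float8_e4m3fn and not current_platform.has_device_capability(89):
        pytest.skip(
            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
        )
```

Each parametrized test would then call `skip_if_fp8_unsupported(dtype)` before allocating fp8 tensors, instead of duplicating the check.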
From b5d70751d82c272a72f105299ef24ae316c41ded Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 30 Oct 2025 12:39:34 +0800 Subject: [PATCH 009/976] [BugFix] Reordering extend logic fix (#27739) Signed-off-by: Lucas Wilkinson --- tests/v1/attention/test_batch_reordering.py | 21 ++++++++++++++++++--- vllm/v1/attention/backends/utils.py | 10 +++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py index b271409b92955..e37219454222b 100644 --- a/tests/v1/attention/test_batch_reordering.py +++ b/tests/v1/attention/test_batch_reordering.py @@ -53,7 +53,7 @@ REORDER_TEST_CASES = { expected_modified=True, ), "already_ordered": ReorderTestCase( - requests=[(1, 10), (1, 20), (100, 100), (200, 200)], + requests=[(1, 10), (1, 20), (100, 100), (200, 0)], expected_order=[0, 1, 2, 3], expected_modified=False, ), @@ -74,15 +74,30 @@ REORDER_TEST_CASES = { expected_modified=True, ), "decode_extend_prefill": ReorderTestCase( - requests=[(100, 100), (10, 50), (1, 10)], + requests=[(100, 0), (10, 50), (1, 10)], expected_order=[2, 1, 0], expected_modified=True, ), "extend_prefill_only": ReorderTestCase( - requests=[(100, 100), (10, 50), (200, 200), (20, 75)], + requests=[(100, 0), (10, 50), (200, 0), (20, 75)], expected_order=[3, 1, 2, 0], # Only swap 0↔3, keep 1 and 2 in place expected_modified=True, ), + "complicated_mixed_interleaved": ReorderTestCase( + requests=[ + (1, 20), + (1, 50), + (374, 0), + (300, 20), + (1, 20), + (256, 0), + (1, 5), + (27, 0), + (1, 4), + ], + expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5], + expected_modified=True, + ), } diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 389baf1488be0..07d62e9849e00 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -811,8 +811,8 @@ def reorder_batch_to_split_decodes_and_prefills( num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs] is_decode = num_scheduled_tokens_np <= decode_threshold - is_extend = (~is_decode) & (num_computed_tokens_np > num_scheduled_tokens_np) - is_prefill = (~is_decode) & (num_computed_tokens_np == num_scheduled_tokens_np) + is_extend = (~is_decode) & (num_computed_tokens_np > 0) + is_prefill = (~is_decode) & (num_computed_tokens_np == 0) # Desired order: decode → extend → prefill req_regions = np.zeros(is_decode.shape, dtype=np.int32) # 0 = decode by default @@ -832,11 +832,11 @@ def reorder_batch_to_split_decodes_and_prefills( return False # Extract indices that need swapping and sort by target region - swap_indices = np.where(needs_swap)[0] + orig_indices = np.where(needs_swap)[0] sorted_order = np.argsort(req_regions[needs_swap], kind="stable") - dest_indices = swap_indices[sorted_order] + src_indices = orig_indices[sorted_order] - src_dest_map = {int(src): int(dst) for src, dst in zip(swap_indices, dest_indices)} + src_dest_map = {int(src): int(dst) for src, dst in zip(src_indices, orig_indices)} for src in src_dest_map: dst = src_dest_map[src] From 8bff831f0aa239006f34b721e63e1340e3472067 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 29 Oct 2025 21:43:37 -0700 Subject: [PATCH 010/976] [Benchmark] Cleanup deprecated nightly benchmark and adjust the docstring for performance benchmark (#25786) Signed-off-by: KuntaiDu --- .../benchmark-pipeline.yaml | 184 ------- .../nightly-benchmarks/nightly-annotation.md | 28 -- .../nightly-descriptions.md | 39 -- .../nightly-benchmarks/nightly-pipeline.yaml | 196 
-------- .../scripts/download-tokenizer.py | 26 - .../scripts/generate-nightly-markdown.py | 97 ---- .../scripts/get-lmdeploy-modelname.py | 9 - .../scripts/nightly-annotate.sh | 78 --- .../scripts/run-nightly-benchmarks.sh | 464 ------------------ .../scripts/summary-nightly-results.py | 82 ---- .../scripts/wait-for-image.sh | 23 - .../README.md | 54 +- .../performance-benchmarks-descriptions.md | 0 .../scripts/compare-json-results.py | 0 .../convert-results-json-to-markdown.py | 2 +- .../scripts/launch-server.sh | 0 .../scripts/run-performance-benchmarks.sh | 2 +- .../tests/genai-perf-tests.json | 0 .../tests/latency-tests-cpu.json | 0 .../tests/latency-tests.json | 0 .../tests/nightly-tests.json | 0 .../tests/serving-tests-cpu-snc2.json | 0 .../tests/serving-tests-cpu-snc3.json | 0 .../tests/serving-tests-cpu.json | 0 .../tests/serving-tests.json | 0 .../tests/throughput-tests-cpu.json | 0 .../tests/throughput-tests.json | 0 .github/mergify.yml | 2 +- docs/contributing/benchmarks.md | 13 +- 29 files changed, 10 insertions(+), 1289 deletions(-) delete mode 100644 .buildkite/nightly-benchmarks/benchmark-pipeline.yaml delete mode 100644 .buildkite/nightly-benchmarks/nightly-annotation.md delete mode 100644 .buildkite/nightly-benchmarks/nightly-descriptions.md delete mode 100644 .buildkite/nightly-benchmarks/nightly-pipeline.yaml delete mode 100644 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py delete mode 100644 .buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py delete mode 100644 .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py delete mode 100644 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh delete mode 100644 .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh delete mode 100644 .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py delete mode 100644 .buildkite/nightly-benchmarks/scripts/wait-for-image.sh rename .buildkite/{nightly-benchmarks => performance-benchmarks}/README.md (69%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/performance-benchmarks-descriptions.md (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/compare-json-results.py (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/convert-results-json-to-markdown.py (99%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/launch-server.sh (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/scripts/run-performance-benchmarks.sh (99%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/genai-perf-tests.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/latency-tests-cpu.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/latency-tests.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/nightly-tests.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests-cpu-snc2.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests-cpu-snc3.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests-cpu.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/serving-tests.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/throughput-tests-cpu.json (100%) rename .buildkite/{nightly-benchmarks => performance-benchmarks}/tests/throughput-tests.json (100%) diff --git 
a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml deleted file mode 100644 index 4259514940d3f..0000000000000 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ /dev/null @@ -1,184 +0,0 @@ -steps: - - label: "Wait for container to be ready" - key: wait-for-container-image - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - - image: badouralix/curl-jq - command: - - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - - label: "Cleanup H100" - agents: - queue: H100 - depends_on: ~ - command: docker system prune -a --volumes --force - - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch == "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - # Premerge benchmark - - label: "A100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: A100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - 
volumes: - - name: devshm - emptyDir: - medium: Memory - - - label: "H200" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H200 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: 4,5,6,7 - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN - - #- block: "Run H100 Benchmark" - #key: block-h100 - #depends_on: ~ - - - label: "H100" - # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" - agents: - queue: H100 - depends_on: wait-for-container-image - if: build.branch != "main" - plugins: - - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - command: - - bash - - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh - mount-buildkite-agent: true - propagate-environment: true - ipc: host - gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used - volumes: - - /data/benchmark-hf-cache:/root/.cache/huggingface - environment: - - VLLM_USAGE_SOURCE - - HF_TOKEN diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md deleted file mode 100644 index 466def07b6f1f..0000000000000 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ /dev/null @@ -1,28 +0,0 @@ -# Nightly benchmark annotation - -## Description - -This file contains the downloading link for benchmarking results. - -- [benchmarking pipeline](artifact://nightly-pipeline.yaml) -- [benchmarking results](artifact://results.zip) -- [benchmarking code](artifact://nightly-benchmarks.zip) - -Please download the visualization scripts in the post - -## Results reproduction - -- Find the docker we use in `benchmarking pipeline` -- Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code: - - ```bash - export HF_TOKEN= - apt update - apt install -y git - unzip nightly-benchmarks.zip - VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - ``` - -And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md deleted file mode 100644 index 2ef36089b6afb..0000000000000 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ /dev/null @@ -1,39 +0,0 @@ - -# Nightly benchmark - -This benchmark aims to: - -- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload. -- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions. - -Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. 
- -Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) - -## Setup - -- Docker images: - - vLLM: `vllm/vllm-openai:v0.6.2` - - SGLang: `lmsysorg/sglang:v0.3.2-cu121` - - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` - - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` - - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* - - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. -- Hardware - - 8x Nvidia A100 GPUs -- Workload: - - Dataset - - ShareGPT dataset - - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output) - - Decode-heavy dataset (in average 462 input tokens, 256 output tokens) - - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. - - Models: llama-3 8B, llama-3 70B. - - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). - - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. - - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). - -## Known issues - -- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). -- TGI does not support `ignore-eos` flag. diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml deleted file mode 100644 index 199517e8b067c..0000000000000 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ /dev/null @@ -1,196 +0,0 @@ -common_pod_spec: &common_pod_spec - priorityClassName: perf-benchmark - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - -common_container_settings: &common_container_settings - command: - - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - -steps: - - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." 
- - - - - label: "A100 vllm step 10" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.6.2 - <<: *common_container_settings - - - - - label: "A100 sglang benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: lmsysorg/sglang:v0.3.2-cu121 - <<: *common_container_settings - - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: openmmlab/lmdeploy:v0.6.1-cu12 - <<: *common_container_settings - - - - - - label: "A100 trt llama-8B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama8B" - - - - label: "A100 trt llama-70B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - <<: *common_container_settings - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - name: TEST_SELECTOR - value: "llama70B" - - - # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image - # - label: "A100 trt benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 - # <<: *common_container_settings - - - # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. - # - label: "A100 tgi benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # <<: *common_pod_spec - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.2.0 - # <<: *common_container_settings - - - wait - - - label: "Collect the results" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - <<: *common_pod_spec - containers: - - image: vllm/vllm-openai:v0.5.0.post1 - command: - - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - - - block: ":rocket: check the results!" 
\ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py deleted file mode 100644 index 8532ff7ef798c..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse - -from transformers import AutoTokenizer - - -def main(model, cachedir): - # Load the tokenizer and save it to the specified directory - tokenizer = AutoTokenizer.from_pretrained(model) - tokenizer.save_pretrained(cachedir) - print(f"Tokenizer saved to {cachedir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Download and save Hugging Face tokenizer" - ) - parser.add_argument("--model", type=str, required=True, help="Name of the model") - parser.add_argument( - "--cachedir", type=str, required=True, help="Directory to save the tokenizer" - ) - - args = parser.parse_args() - main(args.model, args.cachedir) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py deleted file mode 100644 index 053fd52c35ae9..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import json -from pathlib import Path - -import numpy as np -import pandas as pd -from tabulate import tabulate - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="Parse command line arguments for summary-nightly-results script." - ) - parser.add_argument( - "--results-folder", - type=str, - required=True, - help="The folder where the results are stored.", - ) - parser.add_argument( - "--description", type=str, required=True, help="Description of the results." 
- ) - - args = parser.parse_args() - return args - - -def get_perf(df, method, model, metric): - means = [] - - for qps in [2, 4, 8, 16, "inf"]: - target = df["Test name"].str.contains(model) - target = target & df["Engine"].str.contains(method) - target = target & df["Test name"].str.contains("qps_" + str(qps)) - filtered_df = df[target] - - if filtered_df.empty: - means.append(0.0) - else: - means.append(filtered_df[metric].values[0]) - - return np.array(means) - - -def get_perf_w_std(df, method, model, metric): - if metric in ["TTFT", "ITL"]: - mean = get_perf(df, method, model, "Mean " + metric + " (ms)") - mean = mean.tolist() - std = get_perf(df, method, model, "Std " + metric + " (ms)") - if std.mean() == 0: - std = None - success = get_perf(df, method, model, "Successful req.") - if std is not None: - std = std / np.sqrt(success) - std = std.tolist() - - else: - assert metric == "Tput" - mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( - df, method, model, "Output Tput (tok/s)" - ) - mean = mean.tolist() - std = None - - return mean, std - - -def main(args): - results_folder = Path(args.results_folder) - - results = [] - - # collect results - for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file) as f: - results = results + json.loads(f.read()) - - # generate markdown table - df = pd.DataFrame.from_dict(results) - - md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) - - with open(args.description) as f: - description = f.read() - - description = description.format(nightly_results_benchmarking_table=md_table) - - with open("nightly_results.md", "w") as f: - f.write(description) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py deleted file mode 100644 index ddea1d2b1b1ed..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from lmdeploy.serve.openai.api_client import APIClient - -api_client = APIClient("http://localhost:8000") -model_name = api_client.available_models[0] - -print(model_name) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh deleted file mode 100644 index 69b6b146b3549..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -set -ex -set -o pipefail - - -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get update && apt-get -y install jq) - (which zip) || (apt-get install -y zip) - - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip plotting the results." 
- exit 0 - fi - - # initial annotation - #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" - - # download results - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - mkdir -p results/ - /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ - ls - ls results/ - - # upload benchmark results - zip -r results.zip results/ - /workspace/buildkite-agent artifact upload "results.zip" - - # upload benchmarking scripts - cd "$VLLM_SOURCE_CODE_LOC/" - zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ - /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - # upload benchmarking pipeline - /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - - cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md - - - - # The figures should be generated by a separate process outside the CI/CD pipeline - - # # generate figures - # python3 -m pip install tabulate pandas matplotlib - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ - # --description $description \ - # --results-folder results/ - - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sharegpt - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_2048_128 - - # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ - # --description $description \ - # --results-folder results/ \ - # --dataset sonnet_128_2048 - - # # upload results and figures - # /workspace/buildkite-agent artifact upload "nightly_results*.png" - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml - # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh deleted file mode 100644 index a00de940cbbb8..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ /dev/null @@ -1,464 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -x - -check_gpus() { - # check the number of GPUs and GPU type. - declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [[ $gpu_count -gt 0 ]]; then - echo "GPU found." - else - echo "Need at least 1 GPU to run benchmarking." - exit 1 - fi - declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" - echo "GPU type is $gpu_type" -} - -check_hf_token() { - # check if HF_TOKEN is available and valid - if [[ -z "$HF_TOKEN" ]]; then - echo "Error: HF_TOKEN is not set." - exit 1 - elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then - echo "Error: HF_TOKEN does not start with 'hf_'." - exit 1 - else - echo "HF_TOKEN is set and valid." 
- fi -} - - -upload_to_buildkite() { - # upload the benchmarking results to buildkite - - # if the agent binary is not found, skip uploading the results, exit 0 - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - fi - # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md - /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" -} - - -get_current_llm_serving_engine() { - - if which lmdeploy >/dev/null; then - echo "Container: lmdeploy" - export CURRENT_LLM_SERVING_ENGINE=lmdeploy - return - fi - - if [ -e /tgi-entrypoint.sh ]; then - echo "Container: tgi" - export CURRENT_LLM_SERVING_ENGINE=tgi - return - fi - - if which trtllm-build >/dev/null; then - echo "Container: tensorrt-llm" - export CURRENT_LLM_SERVING_ENGINE=trt - return - fi - - if [ -e /sgl-workspace ]; then - echo "Container: sglang" - export CURRENT_LLM_SERVING_ENGINE=sglang - return - fi - - if [ -e /vllm-workspace ]; then - echo "Container: vllm" - # move to a completely irrelevant directory, to avoid import vllm from current folder - export CURRENT_LLM_SERVING_ENGINE=vllm - - return - fi -} - -json2args() { - # transforms the JSON string to command line args, and '_' is replaced to '-' - # example: - # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } - # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 - local json_string=$1 - local args=$( - echo "$json_string" | jq -r ' - to_entries | - map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | - join(" ") - ' - ) - echo "$args" -} - -kill_gpu_processes() { - pkill -f '[p]ython' - pkill -f '[p]ython3' - pkill -f '[t]ritonserver' - pkill -f '[p]t_main_thread' - pkill -f '[t]ext-generation' - pkill -f '[l]mdeploy' - # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 - pkill -f '[V]LLM' - - while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do - sleep 1 - done -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - timeout 1200 bash -c ' - until curl -s localhost:8000/v1/completions > /dev/null; do - sleep 1 - done' && return 0 || return 1 -} - -ensure_installed() { - # Ensure that the given command is installed by apt-get - local cmd=$1 - if ! which "$cmd" >/dev/null; then - apt-get update && apt-get install -y "$cmd" - fi -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - - local serving_test_file - serving_test_file=$1 - - # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." 
- continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") - client_args=$(json2args "$client_params") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # prepare tokenizer - # this is required for lmdeploy. - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - - - # change model name for lmdeploy (it will not follow standard hf name) - if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then - model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps="inf" - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ $backend = "trt" ]]; then - backend="tensorrt-llm" - fi - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - - if [[ "$dataset_name" = "sharegpt" ]]; then - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - elif [[ "$dataset_name" = "sonnet" ]]; then - - sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') - sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') - sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') - - client_command="vllm bench serve \ - --backend $backend \ - --tokenizer /tokenizer_cache \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --num-prompts $num_prompts 
\ - --sonnet-input-len $sonnet_input_len \ - --sonnet-output-len $sonnet_output_len \ - --sonnet-prefix-len $sonnet_prefix_len \ - --port $port \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --ignore-eos \ - $client_args" - - else - - echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." - exit 1 - - fi - - - - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" - - eval "$client_command" - - server_command="None" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu, - engine: $engine - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" - - done - - done - - kill_gpu_processes -} - -run_genai_perf_tests() { - # run genai-perf tests - - # $1: a json file specifying genai-perf test cases - local genai_perf_test_file - genai_perf_test_file=$1 - - # Iterate over genai-perf tests - jq -c '.[]' "$genai_perf_test_file" | while read -r params; do - # get the test name, and append the GPU type back to it. - test_name=$(echo "$params" | jq -r '.test_name') - - # if TEST_SELECTOR is set, only run the test cases that match the selector - if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then - echo "Skip test case $test_name." - continue - fi - - # prepend the current serving engine to the test name - test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} - - # get common parameters - common_params=$(echo "$params" | jq -r '.common_parameters') - model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') - port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') - reuse_server=$(echo "$common_params" | jq -r '.reuse_server') - - # get client and server arguments - server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") - qps_list=$(echo "$params" | jq -r '.qps_list') - qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') - echo "Running over qps list $qps_list" - - # check if there is enough GPU to run the test - if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." - continue - fi - - if [[ $reuse_server == "true" ]]; then - echo "Reuse previous server for test case $test_name" - else - kill_gpu_processes - bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ - "$server_params" "$common_params" - fi - - if wait_for_server; then - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." - else - echo "" - echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." - break - fi - - # iterate over different QPS - for qps in $qps_list; do - # remove the surrounding single quote from qps - if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" - qps=$num_prompts - echo "now qps is $qps" - fi - - new_test_name=$test_name"_qps_"$qps - backend=$CURRENT_LLM_SERVING_ENGINE - - if [[ "$backend" == *"vllm"* ]]; then - backend="vllm" - fi - #TODO: add output dir. 
- client_command="genai-perf profile \ - -m $model \ - --service-kind openai \ - --backend "$backend" \ - --endpoint-type chat \ - --streaming \ - --url localhost:$port \ - --request-rate $qps \ - --num-prompts $num_prompts \ - " - - echo "Client command: $client_command" - - eval "$client_command" - - #TODO: process/record outputs - done - done - - kill_gpu_processes - -} - -prepare_dataset() { - - # download sharegpt dataset - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - -} - -main() { - - # check if the environment variable is successfully injected from yaml - - check_gpus - check_hf_token - get_current_llm_serving_engine - - pip install -U transformers - - pip install -r requirements/dev.txt - which genai-perf - - # check storage - df -h - - ensure_installed wget - ensure_installed curl - ensure_installed jq - # genai-perf dependency - ensure_installed libb64-0d - - prepare_dataset - - cd "$VLLM_SOURCE_CODE_LOC/benchmarks" - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" - - # run the test - run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" - - # run genai-perf tests - run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" - mv artifacts/ $RESULTS_FOLDER/ - - # upload benchmark results to buildkite - python3 -m pip install tabulate pandas - python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" - upload_to_buildkite - -} - -main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py deleted file mode 100644 index fb3b9d5e34e03..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ /dev/null @@ -1,82 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import datetime -import json -import os -from pathlib import Path - -import pandas as pd -from tabulate import tabulate - -results_folder = Path("results/") - -# serving results and the keys that will be printed into markdown -serving_results = [] -serving_column_mapping = { - "test_name": "Test name", - "gpu_type": "GPU", - "completed": "Successful req.", - "request_throughput": "Tput (req/s)", - "mean_ttft_ms": "Mean TTFT (ms)", - "std_ttft_ms": "Std TTFT (ms)", - "median_ttft_ms": "Median TTFT (ms)", - "mean_itl_ms": "Mean ITL (ms)", - "std_itl_ms": "Std ITL (ms)", - "median_itl_ms": "Median ITL (ms)", - "mean_tpot_ms": "Mean TPOT (ms)", - "std_tpot_ms": "Std TPOT (ms)", - "median_tpot_ms": "Median TPOT (ms)", - "total_token_throughput": "Total Token Tput (tok/s)", - "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", - "engine": "Engine", -} - -if __name__ == "__main__": - # collect results - for test_file in results_folder.glob("*.json"): - with open(test_file) as f: - raw_result = json.loads(f.read()) - - # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands")) as f: - command = json.loads(f.read()) - raw_result.update(command) - - # update the test name of this result - 
raw_result.update({"test_name": test_file.stem}) - - # add the result to raw_result - serving_results.append(raw_result) - continue - - serving_results = pd.DataFrame.from_dict(serving_results) - - if not serving_results.empty: - serving_results = serving_results[list(serving_column_mapping.keys())].rename( - columns=serving_column_mapping - ) - - serving_md_table_with_headers = tabulate( - serving_results, headers="keys", tablefmt="pipe", showindex=False - ) - # remove the first line of header - serving_md_table_lines = serving_md_table_with_headers.split("\n") - serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) - - prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") - - # document benchmarking results in markdown - with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: - # document results with header. - # for those who wants to reproduce our benchmark. - f.write(serving_md_table_with_headers) - f.write("\n") - - # document benchmarking results in json - with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: - results = serving_results.to_dict(orient="records") - f.write(json.dumps(results)) diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh deleted file mode 100644 index 50e1ab0242202..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) -if [[ "$BUILDKITE_BRANCH" == "main" ]]; then - URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" -else - URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" -fi - -TIMEOUT_SECONDS=10 - -retries=0 -while [ $retries -lt 1000 ]; do - if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then - exit 0 - fi - - echo "Waiting for image to be available..." - - retries=$((retries + 1)) - sleep 5 -done - -exit 1 diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md similarity index 69% rename from .buildkite/nightly-benchmarks/README.md rename to .buildkite/performance-benchmarks/README.md index e6f5c8b60f459..332142ba5d170 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -2,40 +2,23 @@ ## Introduction -This directory contains two sets of benchmark for vllm. - -- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance -- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. - -See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. +This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance. 
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD. ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models. **Benchmarking Duration**: about 1hr. **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. -## Nightly benchmark quick overview - -**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. - -**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. - -**Benchmarking Duration**: about 3.5hrs. - ## Trigger the benchmark -Performance benchmark will be triggered when: - -- A PR being merged into vllm. -- Every commit for those PRs with `perf-benchmarks` label AND `ready` label. - -Manually Trigger the benchmark +The benchmark needs to be triggered manually: ```bash -bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh ``` Runtime environment variables: @@ -47,10 +30,6 @@ Runtime environment variables: - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. -Nightly benchmark will be triggered when: - -- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. - ## Performance benchmark details See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. @@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model, A comparison diagram will be generated below the table. Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 image - -## Nightly test details - -See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. - -### Workflow - -- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. -- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. -- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. -- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. - -### Nightly tests - -In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. 
- -### Docker containers - -The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. - -WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. - -WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md similarity index 100% rename from .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md rename to .buildkite/performance-benchmarks/performance-benchmarks-descriptions.md diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py similarity index 100% rename from .buildkite/nightly-benchmarks/scripts/compare-json-results.py rename to .buildkite/performance-benchmarks/scripts/compare-json-results.py diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py similarity index 99% rename from .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py rename to .buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py index a7544aeef4c74..80bb4d846a226 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py @@ -392,7 +392,7 @@ if __name__ == "__main__": json_file = "benchmark_results.json" with open(results_folder / md_file, "w") as f: results = read_markdown( - "../.buildkite/nightly-benchmarks/" + "../.buildkite/performance-benchmarks/" + "performance-benchmarks-descriptions.md" ) results = results.format( diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/performance-benchmarks/scripts/launch-server.sh similarity index 100% rename from .buildkite/nightly-benchmarks/scripts/launch-server.sh rename to .buildkite/performance-benchmarks/scripts/launch-server.sh diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh similarity index 99% rename from .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh rename to .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index 5a47576483bbf..9447ceffd7e22 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -469,7 +469,7 @@ main() { ensure_sharegpt_downloaded declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/ # dump vllm info via vllm collect-env env_output=$(vllm collect-env) diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/performance-benchmarks/tests/genai-perf-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/genai-perf-tests.json rename to .buildkite/performance-benchmarks/tests/genai-perf-tests.json diff --git 
a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-cpu.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/latency-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/latency-tests-cpu.json diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/performance-benchmarks/tests/latency-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/latency-tests.json rename to .buildkite/performance-benchmarks/tests/latency-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/performance-benchmarks/tests/nightly-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/nightly-tests.json rename to .buildkite/performance-benchmarks/tests/nightly-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/serving-tests-cpu.json diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/performance-benchmarks/tests/serving-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/serving-tests.json rename to .buildkite/performance-benchmarks/tests/serving-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json rename to .buildkite/performance-benchmarks/tests/throughput-tests-cpu.json diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json b/.buildkite/performance-benchmarks/tests/throughput-tests.json similarity index 100% rename from .buildkite/nightly-benchmarks/tests/throughput-tests.json rename to .buildkite/performance-benchmarks/tests/throughput-tests.json diff --git a/.github/mergify.yml b/.github/mergify.yml index de1a8314a4ecd..18d4a2e83144b 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -108,7 +108,7 @@ pull_request_rules: - files~=^benchmarks/ - files~=^vllm/benchmarks/ - files~=^tests/benchmarks/ - - files~=^\.buildkite/nightly-benchmarks/ + - files~=^\.buildkite/performance-benchmarks/ actions: label: add: diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index be3e32a73a332..dca01eab5b426 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -9,7 +9,6 @@ vLLM provides comprehensive benchmarking tools for performance testing and evalu - **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing - **[Parameter 
sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations - **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development -- **[Nightly benchmarks](#nightly-benchmarks)**: Comparative benchmarks against alternatives [Benchmark CLI]: #benchmark-cli @@ -1167,7 +1166,7 @@ docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingf Then, run below command inside the docker instance. ```bash -bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh ``` When run, benchmark script generates results under **benchmark/results** folder, along with the benchmark_results.md and benchmark_results.json. @@ -1185,7 +1184,7 @@ For more results visualization, check the [visualizing the results](https://gith The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). -More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). +More information on the performance benchmarks and their parameters can be found in [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and [performance benchmark description](../../.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md). ### Continuous Benchmarking @@ -1210,11 +1209,3 @@ The benchmarking currently runs on a predefined set of models configured in the #### Viewing Results All continuous benchmarking results are automatically published to the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). - -## Nightly Benchmarks - -These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels. - -The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). - -More information on the nightly benchmarks and their parameters can be found [here](../../.buildkite/nightly-benchmarks/nightly-descriptions.md). 
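
Editor's note on the patch above: the docs now point at the renamed `.buildkite/performance-benchmarks/` path instead of `.buildkite/nightly-benchmarks/`. A minimal sketch of a manual run of that entrypoint, assuming a valid Hugging Face token and a locally launched server (the token value and the empty remote settings below are placeholders, not part of this patch), might look like:

```bash
# Hypothetical local invocation of the renamed performance-benchmark entrypoint.
# HF_TOKEN is a placeholder; REMOTE_HOST/REMOTE_PORT are left empty (their documented
# defaults) so the script benchmarks a locally launched vLLM server rather than a remote one.
export HF_TOKEN=hf_xxx
export REMOTE_HOST=""
export REMOTE_PORT=""
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
# Per the updated docs, results are written under the benchmark/results folder as
# benchmark_results.md and benchmark_results.json.
```
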
From ded8ada86a3962477433054debbcef1d45161850 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Thu, 30 Oct 2025 01:28:45 -0400 Subject: [PATCH 011/976] Add more dims for batch invariant shims (#27489) Signed-off-by: Bram Wasti Signed-off-by: Bram Wasti Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/batch_invariant.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 208ffb30e5ed2..5706786bccb1d 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -478,9 +478,48 @@ def matmul_batch_invariant(a, b, *, out=None): elif a.ndim == 3 and b.ndim == 3: # Handle batched case like bmm return bmm_batch_invariant(a, b, out=out) + elif a.ndim == 3 and b.ndim == 2: + # Handle 3D x 2D: common for linear layers + # (batch, seq, hidden) @ (hidden, out) -> (batch, seq, out) + # Reshape to 2D, do mm, reshape back + batch, seq, hidden = a.shape + a_2d = a.reshape(-1, hidden) + result_2d = matmul_persistent(a_2d, b) + result = result_2d.reshape(batch, seq, -1) + if out is not None: + out.copy_(result) + return out + return result + elif a.ndim == 2 and b.ndim == 3: + # Handle 2D x 3D: (M, K) @ (B, K, N) -> (B, M, N) + # By broadcasting `a` to 3D, we can reuse the batched matrix + # multiplication logic. + a_expanded = a.unsqueeze(0).expand(b.shape[0], -1, -1) + return bmm_batch_invariant(a_expanded, b, out=out) + elif a.ndim == 4 and b.ndim == 4: + # Handle 4D attention tensors: [batch, heads, seq, dim] + # Reshape to 3D, process, reshape back + batch, heads, seq_a, dim_a = a.shape + _, _, dim_b, seq_b = b.shape + + # Reshape to [batch*heads, seq_a, dim_a] + a_3d = a.reshape(batch * heads, seq_a, dim_a) + b_3d = b.reshape(batch * heads, dim_b, seq_b) + + # Do batched matmul + result_3d = bmm_batch_invariant(a_3d, b_3d) + + # Reshape back to [batch, heads, seq_a, seq_b] + result = result_3d.reshape(batch, heads, seq_a, seq_b) + + if out is not None: + out.copy_(result) + return out + return result else: raise ValueError( - f"matmul_batch_invariant currently only supports 2D x 2D and 3D x 3D, " + f"matmul_batch_invariant currently only supports 2D x 2D, 3D x 3D, " + f"3D x 2D, 2D x 3D, and 4D x 4D, " f"got shapes {a.shape} and {b.shape}" ) @@ -667,7 +706,8 @@ def rms_norm_batch_invariant( def linear_batch_invariant(input, weight, bias=None): - output = mm_batch_invariant(input, weight.t()) + output = matmul_batch_invariant(input, weight.t()) + if bias is not None: output = output + bias return output From 31b55ffc62189b32dac15fb7c00dba20e3573168 Mon Sep 17 00:00:00 2001 From: yitingdc <59356937+yitingdc@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:47:36 +0800 Subject: [PATCH 012/976] use stringData in secret yaml to store huggingface token (#25685) Signed-off-by: yiting.jiang --- docs/deployment/k8s.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/deployment/k8s.md b/docs/deployment/k8s.md index 54031ec368b5c..abffb7bc5f948 100644 --- a/docs/deployment/k8s.md +++ b/docs/deployment/k8s.md @@ -49,11 +49,14 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa metadata: name: hf-token-secret type: Opaque - data: - token: $(HF_TOKEN) + stringData: + token: "REPLACE_WITH_TOKEN" EOF ``` +Here, the `token` field stores your **Hugging Face access token**. 
For details on how to generate a token, +see the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens). + Next, start the vLLM server as a Kubernetes Deployment and Service: ??? console "Config" From 5be1bed79058ddc1016f2639c52dfb5b597bf39c Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Thu, 30 Oct 2025 00:50:56 -0700 Subject: [PATCH 013/976] [CI/Build]Add eval config for Qwen3-235B-A22B-Instruct-2507-FP8 (#27113) Signed-off-by: Huamin Li <3ericli@gmail.com> --- .../configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml | 14 ++++++++++++++ .../lm-eval-harness/configs/models-large-h100.txt | 1 - .../configs/models-large-hopper.txt | 1 + .../lm-eval-harness/test_lm_eval_correctness.py | 14 +++++++++++--- .buildkite/test-pipeline.yaml | 13 +++++++++++++ 5 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml delete mode 100644 .buildkite/lm-eval-harness/configs/models-large-h100.txt create mode 100644 .buildkite/lm-eval-harness/configs/models-large-hopper.txt diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml new file mode 100644 index 0000000000000..514c15d6098ed --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml @@ -0,0 +1,14 @@ +model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8" +tasks: + - name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.82 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +num_fewshot: 5 +enforce_eager: false # we use false to speed up the eval process +kv_cache_dtype: fp8 # we use fp8 to speed up the eval process +max_model_len: 40960 +apply_chat_template: true +fewshot_as_multiturn: true +gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>" diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt deleted file mode 100644 index 4fb0b84bc4d81..0000000000000 --- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt +++ /dev/null @@ -1 +0,0 @@ -Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-large-hopper.txt b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt new file mode 100644 index 0000000000000..5552391d9eaba --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-hopper.txt @@ -0,0 +1 @@ +Qwen3-235B-A22B-Instruct-2507-FP8.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index f10de82b1d8e8..3627b760eddcf 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size): max_model_len = eval_config.get("max_model_len", 4096) batch_size = eval_config.get("batch_size", "auto") backend = eval_config.get("backend", "vllm") + enforce_eager = eval_config.get("enforce_eager", "true") + kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto") model_args = ( f"pretrained={eval_config['model_name']}," f"tensor_parallel_size={tp_size}," - f"enforce_eager=true," + f"enforce_eager={enforce_eager}," + f"kv_cache_dtype={kv_cache_dtype}," f"add_bos_token=true," f"trust_remote_code={trust_remote_code}," f"max_model_len={max_model_len}," @@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, 
tp_size): limit=eval_config["limit"], # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help # text models. however, this is regressing measured strict-match for - # existing text models in CI, so only apply it for mm. - apply_chat_template=backend == "vllm-vlm", + # existing text models in CI, so only apply it for mm, or explicitly set + apply_chat_template=eval_config.get( + "apply_chat_template", backend == "vllm-vlm" + ), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), + # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) + gen_kwargs=eval_config.get("gen_kwargs"), batch_size=batch_size, ) return results diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d556073cd1049..339e3aab6c031 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1186,6 +1186,19 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + ##### H200 test ##### - label: Distributed Tests (H200) # optional gpu: h200 From e806178d2a9b65ebd536342d58097a825d066b9e Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Thu, 30 Oct 2025 00:54:44 -0700 Subject: [PATCH 014/976] [BugFix][VL] Fix FA selection on Qwen2.5-VL (#27790) Signed-off-by: zhewenli Co-authored-by: Roger Wang --- .buildkite/test-amd.yaml | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 30 +++++++++++++++--------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 56e7b1083b17e..35bd4c99adb78 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -318,7 +318,7 @@ steps: - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index dfaeb663bbe2f..3d67653726bd8 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,10 +43,7 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( ) from vllm.attention.backends.registry import _Backend -from vllm.attention.layer import ( - check_upstream_fa_availability, - maybe_get_vit_flash_attn_backend, -) +from vllm.attention.layer import maybe_get_vit_flash_attn_backend from vllm.attention.ops.vit_attn_wrappers import ( vit_flash_attn_wrapper, vit_xformers_attn_wrapper, @@ -318,6 +315,7 @@ class Qwen2_5_VisionAttention(nn.Module): use_data_parallel: bool = False, attn_backend: _Backend = _Backend.TORCH_SDPA, use_upstream_fa: bool = False, + attn_backend_override: _Backend | None = None, ) -> None: super().__init__() # Per attention head and per partition values. 
@@ -358,8 +356,14 @@ class Qwen2_5_VisionAttention(nn.Module): maybe_get_vit_flash_attn_backend( self.attn_backend, self.use_upstream_fa, + attn_backend_override=attn_backend_override, ) ) + # On ROCm with FLASH_ATTN backend, upstream flash_attn is used + from vllm.platforms import current_platform + + if current_platform.is_rocm() and self.attn_backend == _Backend.FLASH_ATTN: + self.use_upstream_fa = True self.is_flash_attn_backend = self.attn_backend in { _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA, @@ -484,6 +488,7 @@ class Qwen2_5_VisionBlock(nn.Module): use_data_parallel: bool = False, attn_backend: _Backend = _Backend.TORCH_SDPA, use_upstream_fa: bool = False, + attn_backend_override: _Backend | None = None, ) -> None: super().__init__() if norm_layer is None: @@ -499,6 +504,7 @@ class Qwen2_5_VisionBlock(nn.Module): use_data_parallel=use_data_parallel, attn_backend=attn_backend, use_upstream_fa=use_upstream_fa, + attn_backend_override=attn_backend_override, ) self.mlp = Qwen2_5_VisionMLP( dim, @@ -698,13 +704,14 @@ class Qwen2_5_VisionTransformer(nn.Module): dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != _Backend.FLASH_ATTN - and self.attn_backend != _Backend.ROCM_AITER_FA - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = _Backend.FLASH_ATTN - use_upstream_fa = True + + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + use_upstream_fa, + attn_backend_override=attn_backend_override, + ) + ) if self.attn_backend not in { _Backend.FLASH_ATTN, @@ -730,6 +737,7 @@ class Qwen2_5_VisionTransformer(nn.Module): use_data_parallel=use_data_parallel, attn_backend=self.attn_backend, use_upstream_fa=use_upstream_fa, + attn_backend_override=attn_backend_override, ) for layer_idx in range(depth) ] From af826e082045e8bcd3ab2ea3129bcf91da7d58de Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Thu, 30 Oct 2025 17:42:49 +0800 Subject: [PATCH 015/976] [V0 deprecation] Remove VLLM_USE_V1 usage in config module (#27784) Signed-off-by: wangxiyuan --- vllm/config/lora.py | 5 ----- vllm/config/model.py | 25 ++----------------------- vllm/config/speculative.py | 7 ------- vllm/config/vllm.py | 34 +++++++--------------------------- 4 files changed, 9 insertions(+), 62 deletions(-) diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 2f9d638542b65..84e92eef40077 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -9,7 +9,6 @@ from pydantic import ConfigDict, Field, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self -import vllm.envs as envs from vllm.config.utils import config from vllm.logger import init_logger from vllm.platforms import current_platform @@ -106,10 +105,6 @@ class LoRAConfig: return self - def verify_with_cache_config(self, cache_config: CacheConfig): - if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1: - raise ValueError("V0 LoRA does not support CPU offload, please use V1.") - def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): self.lora_dtype = model_config.dtype diff --git a/vllm/config/model.py b/vllm/config/model.py index e22c218c769da..2151939d5a9f6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,7 +32,6 @@ from vllm.transformers_utils.config import ( get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - is_interleaved, try_get_dense_modules, 
try_get_generation_config, try_get_safetensors_metadata, @@ -442,15 +441,12 @@ class ModelConfig: self.enforce_eager = True # Set the default seed to 0 in V1. - # NOTE(woosuk): In V0, we set the default seed to None because the - # driver worker shares the same process as the user process, and thus - # setting a seed affects the user process as well. - # In V1, we use separate processes for workers (unless + # NOTE(woosuk): In V1, we use separate processes for workers (unless # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here # doesn't affect the user process. However, without a consistent seed, # different tensor parallel workers would sample different tokens, # leading to inconsistent results. - if envs.VLLM_USE_V1 and self.seed is None: + if self.seed is None: self.seed = 0 if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( @@ -703,23 +699,6 @@ class ModelConfig: revision=self.revision, ) - # Interleaved attention is not supported by some backends in V0 - if ( - not self.disable_sliding_window - and is_interleaved(self.hf_text_config) - and not envs.VLLM_USE_V1 - and (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER") - ): - logger.warning_once( - "%s has interleaved attention, which is currently not " - "supported by the %s backend. Disabling sliding window and " - "capping the max length to the sliding window size (%d).", - self.hf_text_config.model_type, - backend, - self.hf_text_config.sliding_window, - ) - self.disable_sliding_window = True - self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) # Init multimodal config if needed diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 4c7b7369ed4b5..903b9a26fab88 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -9,7 +9,6 @@ from pydantic import Field, SkipValidation, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self -import vllm.envs as envs from vllm.config.parallel import ParallelConfig from vllm.config.utils import config from vllm.logger import init_logger @@ -366,12 +365,6 @@ class SpeculativeConfig: # Replace hf_config for EAGLE draft_model if self.method in ("eagle", "eagle3"): - if self.enable_chunked_prefill and not envs.VLLM_USE_V1: - raise ValueError( - "Chunked prefill and EAGLE are not compatible " - "when using V0." - ) - from vllm.transformers_utils.configs import SpeculatorsConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index c46f409edab61..f592a708a02b5 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -130,7 +130,6 @@ class VllmConfig: from vllm import __version__ vllm_factors.append(__version__) - vllm_factors.append(envs.VLLM_USE_V1) if self.model_config: vllm_factors.append(self.model_config.compute_hash()) else: @@ -306,7 +305,6 @@ class VllmConfig: self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config is not None: - self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) if self.quant_config is None and self.model_config is not None: @@ -332,18 +330,9 @@ class VllmConfig: # we use the default mode. The default mode depends on other # settings (see the below code). 
if self.compilation_config.mode is None: - if envs.VLLM_USE_V1: - if ( - self.model_config is not None - and not self.model_config.enforce_eager - ): - self.compilation_config.mode = CompilationMode.VLLM_COMPILE - else: - self.compilation_config.mode = CompilationMode.NONE - + if self.model_config is not None and not self.model_config.enforce_eager: + self.compilation_config.mode = CompilationMode.VLLM_COMPILE else: - # NB: Passing both --enforce-eager and a compilation mode - # in V0 means the compilation mode wins out. self.compilation_config.mode = CompilationMode.NONE else: assert self.compilation_config.mode >= CompilationMode.NONE @@ -371,10 +360,7 @@ class VllmConfig: # if cudagraph_mode is not explicitly set by users, set default # value if self.compilation_config.cudagraph_mode is None: - if ( - envs.VLLM_USE_V1 - and self.compilation_config.mode == CompilationMode.VLLM_COMPILE - ): + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: # default to full and piecewise for most models self.compilation_config.cudagraph_mode = ( CUDAGraphMode.FULL_AND_PIECEWISE @@ -428,7 +414,7 @@ class VllmConfig: # override related settings when enforce eager self.compilation_config.max_cudagraph_capture_size = 0 self.compilation_config.cudagraph_capture_sizes = [] - elif envs.VLLM_USE_V1: + else: self.compilation_config.cudagraph_num_of_warmups = 1 self._set_cudagraph_sizes() @@ -535,14 +521,11 @@ class VllmConfig: current_platform.check_and_update_config(self) # Do this after all the updates to compilation_config.mode - if ( - envs.VLLM_USE_V1 - and self.compilation_config.mode == CompilationMode.VLLM_COMPILE - ): + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: self.compilation_config.set_splitting_ops_for_v1() # final check of cudagraph mode after all possible updates - if envs.VLLM_USE_V1 and current_platform.is_cuda_alike(): + if current_platform.is_cuda_alike(): if ( self.compilation_config.cudagraph_mode.has_full_cudagraphs() and self.model_config is not None @@ -587,10 +570,7 @@ class VllmConfig: if not self.instance_id: self.instance_id = random_uuid()[:5] - if ( - envs.VLLM_USE_V1 - and not self.scheduler_config.disable_hybrid_kv_cache_manager - ): + if not self.scheduler_config.disable_hybrid_kv_cache_manager: # logger should only print warning message for hybrid models. As we # can't know whether the model is hybrid or not now, so we don't log # warning message here and will log it later. From c7d2a554baf8694503e6865b5df300650b6c6b6b Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Thu, 30 Oct 2025 03:13:03 -0700 Subject: [PATCH 016/976] [CI Failure] fix test_default_mm_loras (#27795) Signed-off-by: Huamin Li <3ericli@gmail.com> --- tests/lora/test_default_mm_loras.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/lora/test_default_mm_loras.py b/tests/lora/test_default_mm_loras.py index 1a5b9ba3641d3..dfc45e78e464f 100644 --- a/tests/lora/test_default_mm_loras.py +++ b/tests/lora/test_default_mm_loras.py @@ -30,7 +30,8 @@ VLLM_RUNNER_BASE_KWARGS = { "enable_lora": "True", "max_num_seqs": 2, "max_lora_rank": 320, - "max_model_len": 12800, + # Keep these LoRA tests on short-RoPE for determinism post-LongRoPE change. 
+ "max_model_len": 4096, "gpu_memory_utilization": 0.8, "limit_mm_per_prompt": {"audio": 1}, "enforce_eager": True, From c01f6e525f457133cfb00127a89c09e5247e563c Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 30 Oct 2025 07:32:17 -0400 Subject: [PATCH 017/976] [CI] Fix mypy for `vllm/v1/core` and `vllm/v1/engine` (#27108) Signed-off-by: yewentao256 --- tools/pre_commit/mypy.py | 14 +++++++++++- vllm/config/vllm.py | 9 ++++---- vllm/engine/protocol.py | 1 + vllm/v1/core/sched/scheduler.py | 16 ++++++++------ vllm/v1/engine/async_llm.py | 21 +++++++++++------- vllm/v1/engine/core.py | 1 + vllm/v1/engine/core_client.py | 14 ++++++------ vllm/v1/engine/detokenizer.py | 13 +++++++++--- vllm/v1/engine/llm_engine.py | 16 ++++++++------ vllm/v1/engine/output_processor.py | 10 +++++++-- vllm/v1/engine/parallel_sampling.py | 4 ++-- vllm/v1/engine/processor.py | 33 ++++++++++++----------------- 12 files changed, 91 insertions(+), 61 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index a3aa546347255..8d04848f8f780 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -36,12 +36,15 @@ FILES = [ "vllm/transformers_utils", "vllm/triton_utils", "vllm/usage", + "vllm/v1/core", + "vllm/v1/engine", ] # After fixing errors resulting from changing follow_imports # from "skip" to "silent", move the following directories to FILES SEPARATE_GROUPS = [ "tests", + # v0 related "vllm/attention", "vllm/compilation", "vllm/engine", @@ -50,7 +53,16 @@ SEPARATE_GROUPS = [ "vllm/model_executor", "vllm/plugins", "vllm/worker", - "vllm/v1", + # v1 related + "vllm/v1/attention", + "vllm/v1/executor", + "vllm/v1/kv_offload", + "vllm/v1/metrics", + "vllm/v1/pool", + "vllm/v1/sample", + "vllm/v1/spec_decode", + "vllm/v1/structured_output", + "vllm/v1/worker", ] # TODO(woosuk): Include the code from Megatron and HuggingFace. 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f592a708a02b5..1acac70c32b03 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -84,7 +84,9 @@ class VllmConfig: default_factory=StructuredOutputsConfig ) """Structured outputs configuration.""" - observability_config: ObservabilityConfig | None = None + observability_config: ObservabilityConfig = Field( + default_factory=ObservabilityConfig + ) """Observability configuration.""" quant_config: QuantizationConfig | None = None """Quantization configuration.""" @@ -170,10 +172,7 @@ class VllmConfig: vllm_factors.append(self.structured_outputs_config.compute_hash()) else: vllm_factors.append("None") - if self.observability_config: - vllm_factors.append(self.observability_config.compute_hash()) - else: - vllm_factors.append("None") + vllm_factors.append(self.observability_config.compute_hash()) if self.quant_config: pass # should be captured by model_config.quantization if self.compilation_config: diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 959a0342817c2..24fcd9fe1cab9 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -77,6 +77,7 @@ class EngineClient(ABC): lora_request: LoRARequest | None = None, trace_headers: Mapping[str, str] | None = None, priority: int = 0, + truncate_prompt_tokens: int | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from a pooling model.""" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c794886bc24c8..ad6fbee2ec083 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -167,7 +167,7 @@ class Scheduler(SchedulerInterface): self.kv_cache_manager = KVCacheManager( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, - enable_caching=self.cache_config.enable_prefix_caching, + enable_caching=bool(self.cache_config.enable_prefix_caching), use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, @@ -407,13 +407,13 @@ class Scheduler(SchedulerInterface): # Get externally-cached tokens if using a KVConnector. if self.connector is not None: - num_external_computed_tokens, load_kv_async = ( + ext_tokens, load_kv_async = ( self.connector.get_num_new_matched_tokens( request, num_new_local_computed_tokens ) ) - if num_external_computed_tokens is None: + if ext_tokens is None: # The request cannot be scheduled because # the KVConnector couldn't determine # the number of matched tokens. @@ -421,6 +421,8 @@ class Scheduler(SchedulerInterface): skipped_waiting_requests.prepend_request(request) continue + num_external_computed_tokens = ext_tokens + # Total computed tokens (local + external). 
num_computed_tokens = ( num_new_local_computed_tokens + num_external_computed_tokens @@ -905,13 +907,13 @@ class Scheduler(SchedulerInterface): outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: SpecDecodingStats | None = None - kv_connector_stats = ( + kv_connector_stats: KVConnectorStats | None = ( kv_connector_output.kv_connector_stats if kv_connector_output else None ) if kv_connector_stats and self.connector: - stats = self.connector.get_kv_connector_stats() - if stats: - kv_connector_stats = kv_connector_stats.aggregate(stats) + kv_stats = self.connector.get_kv_connector_stats() + if kv_stats: + kv_connector_stats = kv_connector_stats.aggregate(kv_stats) failed_kv_load_req_ids = None if kv_connector_output and kv_connector_output.invalid_block_ids: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 761c37504d80a..dc61d45015682 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -6,7 +6,7 @@ import socket import time from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy -from typing import Any +from typing import Any, cast import numpy as np import torch @@ -131,10 +131,9 @@ class AsyncLLM(EngineClient): self.output_processor = OutputProcessor( self.tokenizer, log_stats=self.log_stats ) - if self.observability_config.otlp_traces_endpoint is not None: - tracer = init_tracer( - "vllm.llm_engine", self.observability_config.otlp_traces_endpoint - ) + endpoint = self.observability_config.otlp_traces_endpoint + if endpoint is not None: + tracer = init_tracer("vllm.llm_engine", endpoint) self.output_processor.tracer = tracer # EngineCore (starts the engine in background process). @@ -266,7 +265,9 @@ class AsyncLLM(EngineClient): if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() - cancel_task_threadsafe(getattr(self, "output_handler", None)) + handler = getattr(self, "output_handler", None) + if handler is not None: + cancel_task_threadsafe(handler) async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return await self.engine_core.get_supported_tasks_async() @@ -314,7 +315,10 @@ class AsyncLLM(EngineClient): priority, data_parallel_rank, ) - prompt_text = prompt if isinstance(prompt, str) else prompt.get("prompt") + if isinstance(prompt, str): + prompt_text = prompt + elif isinstance(prompt, Mapping): + prompt_text = cast(str | None, prompt.get("prompt")) if is_pooling or params.n == 1: await self._add_request(request, prompt_text, None, 0, queue) @@ -436,6 +440,7 @@ class AsyncLLM(EngineClient): # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. 
finished = out.finished + assert isinstance(out, RequestOutput) yield out # If the request is disconnected by the client, generate() @@ -653,7 +658,7 @@ class AsyncLLM(EngineClient): return self.tokenizer async def is_tracing_enabled(self) -> bool: - return self.observability_config.otlp_traces_endpoint is not None + return self.observability_config.otlp_traces_endpoint is not None # type: ignore async def do_log_stats(self) -> None: if self.logger_manager: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 85cab32ebfb85..6cbd986b3cd32 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1075,6 +1075,7 @@ class DPEngineCoreProc(EngineCoreProc): local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local assert dp_size > 1 + assert local_dp_rank is not None assert 0 <= local_dp_rank <= dp_rank < dp_size if vllm_config.kv_transfer_config is not None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 7b554ca991b9b..9b440505bd9dc 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -385,10 +385,11 @@ class BackgroundResources: with contextlib.suppress(Exception): task.cancel() - if in_loop(loop): - close_sockets_and_tasks() - elif loop and not loop.is_closed(): - loop.call_soon_threadsafe(close_sockets_and_tasks) + if loop is not None: + if in_loop(loop): + close_sockets_and_tasks() + elif not loop.is_closed(): + loop.call_soon_threadsafe(close_sockets_and_tasks) else: # Loop has been closed, try to clean up directly. del tasks @@ -1044,6 +1045,7 @@ class DPAsyncMPClient(AsyncMPClient): return assert self.stats_update_address is not None + stats_addr: str = self.stats_update_address assert len(self.engine_ranks_managed) > 0 # NOTE: running and waiting counts are all global from # the Coordinator include all global EngineCores. This @@ -1054,9 +1056,7 @@ class DPAsyncMPClient(AsyncMPClient): async def run_engine_stats_update_task(): with ( - make_zmq_socket( - self.ctx, self.stats_update_address, zmq.XSUB, linger=0 - ) as socket, + make_zmq_socket(self.ctx, stats_addr, zmq.XSUB, linger=0) as socket, make_zmq_socket( self.ctx, self.first_req_sock_addr, zmq.PAIR, bind=False, linger=0 ) as first_req_rcv_socket, diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 5f66e36893bf3..b7a24096bf15f 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -69,14 +69,21 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # Stop strings params = request.sampling_params assert params is not None - self.stop = stop = params.stop + stop_list: list[str] + if params.stop is None: + stop_list = [] + elif isinstance(params.stop, str): + stop_list = [params.stop] + else: + stop_list = params.stop + self.stop = stop_list self.min_tokens = params.min_tokens self.include_stop_str_in_output = params.include_stop_str_in_output # Number of chars to hold back when stop strings are to be excluded # from streamed output. 
- if stop and not self.include_stop_str_in_output: - self.stop_buffer_length = max(len(s) for s in stop) - 1 + if self.stop and not self.include_stop_str_in_output: + self.stop_buffer_length = max(len(s) for s in self.stop) - 1 else: self.stop_buffer_length = 0 self._last_output_text_offset: int = 0 diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0fce343702e0a..c2ca9579d55ea 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -4,7 +4,7 @@ import time from collections.abc import Callable, Mapping from copy import copy -from typing import Any +from typing import Any, cast import torch.nn as nn from typing_extensions import TypeVar @@ -112,10 +112,9 @@ class LLMEngine: self.output_processor = OutputProcessor( self.tokenizer, log_stats=self.log_stats ) - if self.observability_config.otlp_traces_endpoint is not None: - tracer = init_tracer( - "vllm.llm_engine", self.observability_config.otlp_traces_endpoint - ) + endpoint = self.observability_config.otlp_traces_endpoint + if endpoint is not None: + tracer = init_tracer("vllm.llm_engine", endpoint) self.output_processor.tracer = tracer # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) @@ -259,7 +258,10 @@ class LLMEngine: trace_headers, priority, ) - prompt_text = prompt if isinstance(prompt, str) else prompt.get("prompt") + if isinstance(prompt, str): + prompt_text = prompt + elif isinstance(prompt, Mapping): + prompt_text = cast(str | None, prompt.get("prompt")) n = params.n if isinstance(params, SamplingParams) else 1 @@ -285,7 +287,7 @@ class LLMEngine: # Add the request to EngineCore. self.engine_core.add_request(child_request) - def step(self) -> list[RequestOutput] | list[PoolingRequestOutput]: + def step(self) -> list[RequestOutput | PoolingRequestOutput]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False self.engine_core.execute_dummy_batch() diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 44e4eadce42ac..07c8113dd9b33 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -44,10 +44,16 @@ class RequestOutputCollector: if self.output is None or isinstance(output, Exception): self.output = output self.ready.set() - elif isinstance(self.output, (RequestOutput, PoolingRequestOutput)): + elif isinstance(self.output, RequestOutput) and isinstance( + output, RequestOutput + ): # This ensures that request outputs with different request indexes # (if n > 1) do not override each other. self.output.add(output, aggregate=self.aggregate) + elif isinstance(self.output, PoolingRequestOutput) and isinstance( + output, PoolingRequestOutput + ): + self.output = output async def get(self) -> RequestOutput | PoolingRequestOutput: """Get operation blocks on put event.""" @@ -408,7 +414,7 @@ class OutputProcessor: within the loop below. 
""" - request_outputs: list[RequestOutput] | list[PoolingRequestOutput] = [] + request_outputs: list[RequestOutput | PoolingRequestOutput] = [] reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 2a47befec25f1..26ee10d2b9bbf 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import copy -from typing import Optional +from typing import Optional, cast from vllm.outputs import CompletionOutput from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -37,7 +37,7 @@ class ParentRequest: self.child_requests = set() self.output_aggregator = ( - [None] * sampling_params.n + [cast(CompletionOutput, None)] * sampling_params.n if (sampling_params.output_kind == RequestOutputKind.FINAL_ONLY) else [] ) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index de15677aeea91..c49fd1bde8b98 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -3,7 +3,7 @@ import time from collections.abc import Mapping -from typing import Any, Literal +from typing import Any, Literal, cast from vllm.config import VllmConfig from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs @@ -208,9 +208,9 @@ class Processor: enc = prompt.get("encoder_prompt") dec = prompt.get("decoder_prompt") if enc is not None: - _validate_single_prompt(enc) + _validate_single_prompt(cast(dict | str, enc)) if dec is not None: - _validate_single_prompt(dec) + _validate_single_prompt(cast(dict | str, dec)) else: _validate_single_prompt(prompt) # type: ignore[arg-type] @@ -332,7 +332,7 @@ class Processor: if not mm_data: return None - mm_uuids: MultiModalUUIDDict = {} + mm_uuids: dict[str, list[str | None] | str] = {} for modality, data in mm_data.items(): n = len(data) if isinstance(data, list) else 1 mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)] @@ -384,7 +384,9 @@ class Processor: # if provided. self._validate_multi_modal_uuids(prompt) if isinstance(prompt, dict): - mm_uuids = prompt.get("multi_modal_uuids") + mm_uuids = cast( + MultiModalUUIDDict | None, prompt.get("multi_modal_uuids") + ) else: mm_uuids = None @@ -410,20 +412,13 @@ class Processor: encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) self._validate_model_inputs(encoder_inputs, decoder_inputs) - # Mypy does not always properly infer the types of some elements of - # discriminated unions of TypedDicts, because of how it handles - # inheritance of TypedDict. If we explicitly extract the items we want - # we can avoid type errors from using `dict.get` later in the method. - prompt_token_ids = ( - decoder_inputs["prompt_token_ids"] - if decoder_inputs["type"] != "embeds" - else None - ) - prompt_embeds = ( - decoder_inputs["prompt_embeds"] - if decoder_inputs["type"] == "embeds" - else None - ) + # Mypy can be conservative for TypedDict unions; normalize access. 
+ if decoder_inputs["type"] == "embeds": + prompt_token_ids = None + prompt_embeds = decoder_inputs["prompt_embeds"] + else: + prompt_token_ids = decoder_inputs["prompt_token_ids"] + prompt_embeds = None sampling_params = None pooling_params = None From 74374386e27f9e7a056a37960d5e996093e45ac4 Mon Sep 17 00:00:00 2001 From: Sairam Pillai Date: Thu, 30 Oct 2025 17:27:59 +0530 Subject: [PATCH 018/976] [Bugfix] Improve GPU validation logging in Ray fallback scenarios (#25775) Signed-off-by: Sairam Pillai --- vllm/config/parallel.py | 14 ++++------ vllm/v1/executor/ray_utils.py | 50 ++++++++++++++++++++++++++++++----- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index e8847354bb092..82d575f24690d 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -521,15 +521,11 @@ class ParallelConfig: current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size ): - if not ray_found: - raise ValueError( - "Unable to load Ray: " - f"{ray_utils.ray_import_err}. Ray is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`." - ) - backend = "ray" + gpu_count = cuda_device_count_stateless() + raise ValueError( + f"Tensor parallel size ({self.world_size}) cannot be " + f"larger than the number of available GPUs ({gpu_count})." + ) elif self.data_parallel_backend == "ray": logger.info( "Using ray distributed inference because " diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index 518f1582faeb0..382f008266e62 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): try: ray.get(pg_ready_ref, timeout=0) except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` and `ray list nodes` to make sure the cluster has " - "enough resources." - ) from None + # Provide more helpful error message when GPU count is exceeded + total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs) + # If more than one GPU is required for the placement group, provide a + # more specific error message. + # We use >1 here because multi-GPU (tensor parallel) jobs are more + # likely to fail due to insufficient cluster resources, and users may + # need to adjust tensor_parallel_size to fit available GPUs. + if total_gpu_required > 1: + raise ValueError( + f"Cannot provide a placement group requiring " + f"{total_gpu_required} GPUs " + f"(placement_group_specs={placement_group_specs}) within " + f"{PG_WAIT_TIMEOUT} seconds.\n" + f"Tensor parallel size may exceed available GPUs in your " + f"cluster. Check resources with `ray status` and " + f"`ray list nodes`.\n" + f"If running on K8s with limited GPUs, consider reducing " + f"--tensor-parallel-size to match available GPU resources." + ) from None + else: + raise ValueError( + "Cannot provide a placement group of " + f"{placement_group_specs=} within " + f"{PG_WAIT_TIMEOUT} seconds. See " + "`ray status` and `ray list nodes` to make sure the cluster " + "has enough resources." 
+ ) from None def _wait_until_pg_removed(current_placement_group: "PlacementGroup"): @@ -299,6 +320,23 @@ def initialize_ray_cluster( assert_ray_available() from vllm.platforms import current_platform + # Prevalidate GPU requirements before Ray processing + if current_platform.is_cuda() and parallel_config.world_size > 1: + from vllm.utils import cuda_device_count_stateless + + available_gpus = cuda_device_count_stateless() + if parallel_config.world_size > available_gpus: + logger.warning( + "Tensor parallel size (%d) exceeds available GPUs (%d). " + "This may result in Ray placement group allocation failures. " + "Consider reducing tensor_parallel_size to %d or less, " + "or ensure your Ray cluster has %d GPUs available.", + parallel_config.world_size, + available_gpus, + available_gpus, + parallel_config.world_size, + ) + if ray.is_initialized(): logger.info("Ray is already initialized. Skipping Ray initialization.") elif current_platform.is_rocm() or current_platform.is_xpu(): From 4464723f220a74785cd1971cf62a04e3961c2846 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 30 Oct 2025 20:13:05 +0800 Subject: [PATCH 019/976] [Frontend][Doc][5/N] Improve all pooling task | Polish encode (pooling) api & Document. (#25524) Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- docs/design/io_processor_plugins.md | 2 +- docs/models/pooling_models.md | 83 +++++++++++--- docs/serving/openai_compatible_server.md | 4 +- examples/offline_inference/pooling/README.md | 12 ++ examples/offline_inference/pooling/ner.py | 2 +- .../{ => pooling}/prithvi_geospatial_mae.py | 0 .../prithvi_geospatial_mae_io_processor.py | 0 examples/online_serving/pooling/README.md | 40 ++++++- .../openai_cross_encoder_score.py | 0 ...enai_cross_encoder_score_for_multimodal.py | 0 .../{ => pooling}/prithvi_geospatial_mae.py | 0 .../entrypoints/pooling/llm/test_classify.py | 12 +- tests/entrypoints/pooling/llm/test_reward.py | 12 +- tests/entrypoints/pooling/llm/test_score.py | 10 +- .../pooling/openai/test_classification.py | 92 +++++++++++---- .../pooling/openai/test_embedding.py | 53 ++++++++- .../entrypoints/pooling/openai/test_rerank.py | 53 +++++++-- .../entrypoints/pooling/openai/test_score.py | 16 +-- .../test_pooler_config_init_behaviour.py | 8 +- tests/test_pooling_params.py | 14 +-- vllm/config/pooler.py | 38 +++++- vllm/entrypoints/openai/api_server.py | 8 +- vllm/entrypoints/openai/protocol.py | 108 ++++++++++++++++-- vllm/entrypoints/openai/serving_pooling.py | 23 ++-- vllm/model_executor/layers/pooler.py | 4 +- vllm/model_executor/models/config.py | 4 +- vllm/pooling_params.py | 32 +++--- 27 files changed, 499 insertions(+), 131 deletions(-) rename examples/offline_inference/{ => pooling}/prithvi_geospatial_mae.py (100%) rename examples/offline_inference/{ => pooling}/prithvi_geospatial_mae_io_processor.py (100%) rename examples/online_serving/{ => pooling}/openai_cross_encoder_score.py (100%) rename examples/online_serving/{ => pooling}/openai_cross_encoder_score_for_multimodal.py (100%) rename examples/online_serving/{ => pooling}/prithvi_geospatial_mae.py (100%) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index fb64a7bb9c8f1..2f4b17f191a5d 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen The `validate_or_generate_params` 
method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/openai/serving_pooling.py). -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/prithvi_geospatial_mae.py](../../examples/online_serving/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py)) inference examples. +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. ## Using an IO Processor plugin diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 40651be1d4495..18bb645ea9a9c 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -30,11 +30,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe vLLM will attempt to automatically convert the model according to the architecture names shown in the table below. -| Architecture | `--convert` | Supported pooling tasks | -|-------------------------------------------------|-------------|-------------------------------| -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `encode`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `encode`, `classify`, `score` | -| `*ForRewardModeling`, `*RewardModel` | `reward` | `encode` | +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|---------------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | +| `*ForRewardModeling`, `*RewardModel` | `reward` | `token_classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model. 
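The table above lists the `--convert` values and the pooling tasks each one enables. As a quick illustration of the tip (the model name here is a placeholder, not a recommendation), an explicit conversion can be requested at serve time:

```bash
# Force a checkpoint to be loaded as a classification-style pooling model.
vllm serve <your-model> --runner pooling --convert classify
```

The same `--convert embed` / `--convert reward` values from the table apply when the automatic architecture-name matching does not pick the conversion you want.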
@@ -45,12 +45,14 @@ Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.pooler.Pooler.get_supported_tasks], enabling the corresponding APIs: -| Task | APIs | -|------------|--------------------------------------| -| `encode` | `LLM.reward(...)` | -| `embed` | `LLM.embed(...)`, `LLM.score(...)`\* | -| `classify` | `LLM.classify(...)` | -| `score` | `LLM.score(...)` | +| Task | APIs | +|------------------|-------------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`, `LLM.score(...)`\*, `LLM.encode(..., pooling_task="embed")` | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | +| `score` | `LLM.score(...)` | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | \* The `LLM.score(...)` API falls back to `embed` task if the model does not support `score` task. @@ -144,7 +146,6 @@ A code example can be found here: [examples/offline_inference/basic/score.py](.. ### `LLM.reward` The [reward][vllm.LLM.reward] method is available to all reward models in vLLM. -It returns the extracted hidden states directly. ```python from vllm import LLM @@ -161,15 +162,17 @@ A code example can be found here: [examples/offline_inference/basic/reward.py](. ### `LLM.encode` The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM. -It returns the extracted hidden states directly. !!! note Please use one of the more specific methods or set the task directly when using `LLM.encode`: - For embeddings, use `LLM.embed(...)` or `pooling_task="embed"`. - For classification logits, use `LLM.classify(...)` or `pooling_task="classify"`. - - For rewards, use `LLM.reward(...)` or `pooling_task="reward"`. - For similarity scores, use `LLM.score(...)`. + - For rewards, use `LLM.reward(...)` or `pooling_task="token_classify"`. + - For token classification, use `pooling_task="token_classify"`. + - For multi-vector retrieval, use `pooling_task="token_embed"` + - For IO Processor Plugins , use `pooling_task="plugin"` ```python from vllm import LLM @@ -185,10 +188,47 @@ print(f"Data: {data!r}") Our [OpenAI-Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: -- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. - [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](../features/multimodal_inputs.md) for embedding models. - [Classification API](../serving/openai_compatible_server.md#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models. - [Score API](../serving/openai_compatible_server.md#score-api) is similar to `LLM.score` for cross-encoder models. +- [Pooling API](../serving/openai_compatible_server.md#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models. + +!!! note + Please use one of the more specific methods or set the task directly when using [Pooling API](../serving/openai_compatible_server.md#pooling-api) api.: + + - For embeddings, use [Embeddings API](../serving/openai_compatible_server.md#embeddings-api) or `"task":"embed"`. 
+ - For classification logits, use [Classification API](../serving/openai_compatible_server.md#classification-api) or `task":"classify"`. + - For similarity scores, use [Score API](../serving/openai_compatible_server.md#score-api). + - For rewards, `task":"token_classify"`. + - For token classification, use `task":"token_classify"`. + - For multi-vector retrieval, use `task":"token_embed"` + - For IO Processor Plugins , use `task":"plugin"` + +```python +# start a supported embeddings model server with `vllm serve`, e.g. +# vllm serve intfloat/e5-small +import requests + +host = "localhost" +port = "8000" +model_name = "intfloat/e5-small" + +api_url = f"http://{host}:{port}/pooling" + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +prompt = {"model": model_name, "input": prompts, "task": "embed"} + +response = requests.post(api_url, json=prompt) + +for output in response.json()["data"]: + data = output["data"] + print(f"Data: {data!r} (size={len(data)})") +``` ## Matryoshka Embeddings @@ -265,3 +305,16 @@ Expected output: ``` An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) + +## Deprecated Features + +### Encode task + +We have split the `encode` task into two more specific token wise tasks: `token_embed` and `token_classify`: + +- `token_embed` is the same as embed, using normalize as activation. +- `token_classify` is the same as classify, default using softmax as activation. + +### Remove softmax from PoolingParams + +We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, you should set `use_activation`, since we actually allow `classify` and `token_classify` to use any activation function. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 1414718a697d5..e331b3422ea64 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -638,7 +638,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
-Code example: [examples/online_serving/openai_cross_encoder_score.py](../../examples/online_serving/openai_cross_encoder_score.py) +Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py) #### Single inference @@ -819,7 +819,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including print("Scoring output:", response_json["data"][0]["score"]) print("Scoring output:", response_json["data"][1]["score"]) ``` -Full example: [examples/online_serving/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/openai_cross_encoder_score_for_multimodal.py) +Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py) #### Extra parameters diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md index cd9717122b16b..ad78be38716b6 100644 --- a/examples/offline_inference/pooling/README.md +++ b/examples/offline_inference/pooling/README.md @@ -38,6 +38,18 @@ python examples/offline_inference/pooling/multi_vector_retrieval.py python examples/offline_inference/pooling/ner.py ``` +## Prithvi Geospatial MAE usage + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae.py +``` + +## IO Processor Plugins for Prithvi Geospatial MAE + +```bash +python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py +``` + ## Qwen3 reranker usage ```bash diff --git a/examples/offline_inference/pooling/ner.py b/examples/offline_inference/pooling/ner.py index b2dffdd6c5ee9..34c80e7ccffd3 100644 --- a/examples/offline_inference/pooling/ner.py +++ b/examples/offline_inference/pooling/ner.py @@ -33,7 +33,7 @@ def main(args: Namespace): label_map = llm.llm_engine.vllm_config.model_config.hf_config.id2label # Run inference - outputs = llm.encode(prompts) + outputs = llm.encode(prompts, pooling_task="token_classify") for prompt, output in zip(prompts, outputs): logits = output.outputs.data diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae.py diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py similarity index 100% rename from examples/offline_inference/prithvi_geospatial_mae_io_processor.py rename to examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md index 3b6da20d5f0fe..b76ad21f04818 100644 --- a/examples/online_serving/pooling/README.md +++ b/examples/online_serving/pooling/README.md @@ -3,65 +3,95 @@ ## Cohere rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python examples/online_serving/pooling/cohere_rerank_client.py ``` ## Embedding requests base64 encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_base64_client.py ``` ## Embedding requests bytes encoding_format usage ```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/embedding_requests_bytes_client.py ``` ## Jinaai rerank usage ```bash +# vllm serve BAAI/bge-reranker-base python 
examples/online_serving/pooling/jinaai_rerank_client.py ``` ## Multi vector retrieval usage ```bash +# vllm serve BAAI/bge-m3 python examples/online_serving/pooling/multi_vector_retrieval_client.py ``` ## Named Entity Recognition (NER) usage ```bash +# vllm serve boltuix/NeuroBERT-NER python examples/online_serving/pooling/ner_client.py ``` -## Openai chat embedding for multimodal usage +## OpenAI chat embedding for multimodal usage ```bash python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py ``` -## Openai classification usage +## OpenAI classification usage ```bash +# vllm serve jason9693/Qwen2.5-1.5B-apeach python examples/online_serving/pooling/openai_classification_client.py ``` -## Openai embedding usage +## OpenAI cross_encoder score usage ```bash +# vllm serve BAAI/bge-reranker-v2-m3 +python examples/online_serving/pooling/openai_cross_encoder_score.py +``` + +## OpenAI cross_encoder score for multimodal usage + +```bash +# vllm serve jinaai/jina-reranker-m0 +python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py +``` + +## OpenAI embedding usage + +```bash +# vllm serve intfloat/e5-small python examples/online_serving/pooling/openai_embedding_client.py ``` -## Openai embedding matryoshka dimensions usage +## OpenAI embedding matryoshka dimensions usage ```bash +# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py ``` -## Openai pooling usage +## OpenAI pooling usage ```bash +# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code python examples/online_serving/pooling/openai_pooling_client.py ``` + +## Online Prithvi Geospatial MAE usage + +```bash +python examples/online_serving/pooling/prithvi_geospatial_mae.py +``` diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/pooling/openai_cross_encoder_score.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score.py rename to examples/online_serving/pooling/openai_cross_encoder_score.py diff --git a/examples/online_serving/openai_cross_encoder_score_for_multimodal.py b/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py similarity index 100% rename from examples/online_serving/openai_cross_encoder_score_for_multimodal.py rename to examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/pooling/prithvi_geospatial_mae.py similarity index 100% rename from examples/online_serving/prithvi_geospatial_mae.py rename to examples/online_serving/pooling/prithvi_geospatial_mae.py diff --git a/tests/entrypoints/pooling/llm/test_classify.py b/tests/entrypoints/pooling/llm/test_classify.py index 96f634ee0a8c7..1063c3b6b755c 100644 --- a/tests/entrypoints/pooling/llm/test_classify.py +++ b/tests/entrypoints/pooling/llm/test_classify.py @@ -37,15 +37,17 @@ def llm(): @pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.classify( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.tensor([x.outputs.probs for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = 
get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/llm/test_reward.py b/tests/entrypoints/pooling/llm/test_reward.py index 81058dbad891b..0255704cecd94 100644 --- a/tests/entrypoints/pooling/llm/test_reward.py +++ b/tests/entrypoints/pooling/llm/test_reward.py @@ -37,15 +37,17 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): outputs = llm.reward( - prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False + prompts, + pooling_params=PoolingParams(use_activation=use_activation), + use_tqdm=False, ) return torch.cat([x.outputs.data for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py index 2df973dd7863b..b69c6a47c1913 100644 --- a/tests/entrypoints/pooling/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -34,21 +34,21 @@ def llm(): def test_pooling_params(llm: LLM): - def get_outputs(activation): + def get_outputs(use_activation): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." outputs = llm.score( text_1, text_2, - pooling_params=PoolingParams(activation=activation), + pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False, ) return torch.tensor([x.outputs.score for x in outputs]) - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
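The test updates above all switch from the removed `activation` flag to `use_activation` on `PoolingParams`. A minimal offline sketch of the same API follows, reusing the classification model exercised elsewhere in this patch; it is illustrative only and assumes the model can be loaded locally.

```python
# Hedged sketch mirroring the updated tests: `use_activation` toggles the
# classification activation; with it disabled, `.outputs.probs` holds raw logits.
from vllm import LLM, PoolingParams

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach")

prompts = ["This product was excellent and exceeded my expectations"]

logits = llm.classify(
    prompts, pooling_params=PoolingParams(use_activation=False), use_tqdm=False
)
probs = llm.classify(
    prompts, pooling_params=PoolingParams(use_activation=True), use_tqdm=False
)
print(logits[0].outputs.probs, probs[0].outputs.probs)
```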
diff --git a/tests/entrypoints/pooling/openai/test_classification.py b/tests/entrypoints/pooling/openai/test_classification.py index 92d40efad21cb..671bb948780ae 100644 --- a/tests/entrypoints/pooling/openai/test_classification.py +++ b/tests/entrypoints/pooling/openai/test_classification.py @@ -7,7 +7,7 @@ import torch import torch.nn.functional as F from tests.utils import RemoteOpenAIServer -from vllm.entrypoints.openai.protocol import ClassificationResponse +from vllm.entrypoints.openai.protocol import ClassificationResponse, PoolingResponse MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" DTYPE = "float32" # Use float32 to avoid NaN issue @@ -163,20 +163,24 @@ async def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): input_text = ["This product was excellent and exceeded my expectations"] - async def get_outputs(activation): + async def get_outputs(use_activation): response = requests.post( server.url_for("classify"), - json={"model": model_name, "input": input_text, "activation": activation}, + json={ + "model": model_name, + "input": input_text, + "use_activation": use_activation, + }, ) outputs = response.json() return torch.tensor([x["probs"] for x in outputs["data"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." @@ -191,18 +195,7 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_pooling(server: RemoteOpenAIServer, model_name: str): - # pooling api uses ALL pooling, which does not support chunked prefill. - response = requests.post( - server.url_for("pooling"), - json={"model": model_name, "input": "test", "encoding_format": "float"}, - ) - assert response.json()["error"]["type"] == "BadRequestError" - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_score(server: RemoteOpenAIServer, model_name: str): +async def test_score(server: RemoteOpenAIServer, model_name: str): # score api is only enabled for num_labels == 1. response = requests.post( server.url_for("score"), @@ -217,7 +210,7 @@ def test_score(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_rerank(server: RemoteOpenAIServer, model_name: str): +async def test_rerank(server: RemoteOpenAIServer, model_name: str): # rerank api is only enabled for num_labels == 1. 
response = requests.post( server.url_for("rerank"), @@ -228,3 +221,62 @@ def test_rerank(server: RemoteOpenAIServer, model_name: str): }, ) assert response.json()["error"]["type"] == "BadRequestError" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 2 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): + # token_classify uses ALL pooling, which does not support chunked prefill. + task = "token_classify" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index b3f12283fdbdf..e971b23e8f1a0 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -562,12 +562,40 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_embed(server: RemoteOpenAIServer, model_name: str): + task = "embed" input_text = ["The chef prepared a delicious meal."] response = requests.post( server.url_for("pooling"), - json={"model": model_name, "input": input_text, "encoding_format": "float"}, + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, + ) + + poolings = PoolingResponse.model_validate(response.json()) + + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_embed(server: RemoteOpenAIServer, model_name: str): + task = "token_embed" + input_text = ["The chef prepared a delicious meal."] + + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": task, + }, ) poolings = PoolingResponse.model_validate(response.json()) @@ -575,3 +603,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert 
len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 384 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["classify", "token_classify", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index e43148d25feeb..1d85190c12a19 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -125,8 +125,8 @@ def test_invocations(server: RemoteOpenAIServer): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_activation(server: RemoteOpenAIServer, model_name: str): - async def get_outputs(activation): +async def test_use_activation(server: RemoteOpenAIServer, model_name: str): + async def get_outputs(use_activation): query = "What is the capital of France?" documents = [ "The capital of Brazil is Brasilia.", @@ -139,16 +139,16 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): "model": model_name, "query": query, "documents": documents, - "activation": activation, + "use_activation": use_activation, }, ) outputs = response.json() return torch.tensor([x["relevance_score"] for x in outputs["results"]]) - default = await get_outputs(activation=None) - w_activation = await get_outputs(activation=True) - wo_activation = await get_outputs(activation=False) + default = await get_outputs(use_activation=None) + w_activation = await get_outputs(use_activation=True) + wo_activation = await get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." 
@@ -163,7 +163,25 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_pooling(server: RemoteOpenAIServer, model_name: str): +async def test_pooling_classify(server: RemoteOpenAIServer, model_name: str): + input_text = "This product was excellent and exceeded my expectations" + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": input_text, + "encoding_format": "float", + "task": "classify", + }, + ) + poolings = PoolingResponse.model_validate(response.json()) + assert len(poolings.data) == 1 + assert len(poolings.data[0].data) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_pooling_token_classify(server: RemoteOpenAIServer, model_name: str): input_text = ["The chef prepared a delicious meal."] response = requests.post( @@ -176,3 +194,24 @@ async def test_pooling(server: RemoteOpenAIServer, model_name: str): assert len(poolings.data) == 1 assert len(poolings.data[0].data) == 11 assert len(poolings.data[0].data[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +async def test_pooling_not_supported( + server: RemoteOpenAIServer, model_name: str, task: str +): + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "input": "test", + "encoding_format": "float", + "task": task, + }, + ) + assert response.json()["error"]["type"] == "BadRequestError" + assert response.json()["error"]["message"].startswith( + f"Task {task} is not supported" + ) diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py index ef213ab0ea18b..b8f796d47efaa 100644 --- a/tests/entrypoints/pooling/openai/test_score.py +++ b/tests/entrypoints/pooling/openai/test_score.py @@ -218,8 +218,8 @@ class TestModel: # TODO: reset this tolerance to 0.01 once we find # an alternative to flash_attn with bfloat16 - def test_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): - def get_outputs(activation): + def test_use_activation(self, server: RemoteOpenAIServer, model: dict[str, Any]): + def get_outputs(use_activation): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." response = requests.post( @@ -228,7 +228,7 @@ class TestModel: "model": model["name"], "text_1": text_1, "text_2": text_2, - "activation": activation, + "use_activation": use_activation, }, ) if response.status_code != 200: @@ -238,9 +238,9 @@ class TestModel: return torch.tensor([x["score"] for x in outputs["data"]]) if model["is_cross_encoder"]: - default = get_outputs(activation=None) - w_activation = get_outputs(activation=True) - wo_activation = get_outputs(activation=False) + default = get_outputs(use_activation=None) + w_activation = get_outputs(use_activation=True) + wo_activation = get_outputs(use_activation=False) assert torch.allclose(default, w_activation, atol=1e-2), ( "Default should use activation." @@ -252,8 +252,8 @@ class TestModel: "w_activation should be close to activation(wo_activation)." 
) else: - get_outputs(activation=None) + get_outputs(use_activation=None) # The activation parameter only works for the is_cross_encoder model - response = get_outputs(activation=True) + response = get_outputs(use_activation=True) assert response.status_code == 400 diff --git a/tests/models/language/pooling/test_pooler_config_init_behaviour.py b/tests/models/language/pooling/test_pooler_config_init_behaviour.py index 55663ee3f1b41..deb5de984d909 100644 --- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py +++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py @@ -24,7 +24,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_activation_out = vllm_model.classify(example_prompts) @@ -32,7 +32,7 @@ def test_classify_models_using_activation( model, max_model_len=512, dtype=dtype, - pooler_config=PoolerConfig(activation=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_activation_out = vllm_model.classify(example_prompts) @@ -104,7 +104,7 @@ def test_reward_models_using_activation( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(activation=False), + pooler_config=PoolerConfig(use_activation=False), ) as vllm_model: wo_activation = vllm_model.reward(example_prompts) @@ -112,7 +112,7 @@ def test_reward_models_using_activation( model, max_model_len=1024, dtype=dtype, - pooler_config=PoolerConfig(activation=True), + pooler_config=PoolerConfig(use_activation=True), ) as vllm_model: w_activation = vllm_model.reward(example_prompts) diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index e73d7efc1483a..7812562c8948c 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -17,7 +17,7 @@ EMBEDDING_MODELS = [ ), ] -classify_parameters = ["activation"] +classify_parameters = ["use_activation"] embed_parameters = ["dimensions", "normalize"] step_pooling_parameters = ["step_tag_id", "returned_token_ids"] @@ -88,13 +88,13 @@ def test_embed_dimensions(model_info: EmbedModelInfo): def test_classify(task): model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS")) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters + step_pooling_parameters @@ -137,13 +137,13 @@ def test_token_classify(pooling_type: str): pooler_config=PoolerConfig(pooling_type=pooling_type) ) - pooling_params = PoolingParams(activation=None) + pooling_params = PoolingParams(use_activation=None) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=True) + pooling_params = PoolingParams(use_activation=True) pooling_params.verify(task=task, model_config=model_config) - pooling_params = PoolingParams(activation=False) + pooling_params = PoolingParams(use_activation=False) pooling_params.verify(task=task, model_config=model_config) invalid_parameters = embed_parameters diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py 
index 0590f74aa4c93..6bece8d0785bd 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -7,6 +7,9 @@ from typing import Any from pydantic.dataclasses import dataclass from vllm.config.utils import config +from vllm.logger import init_logger + +logger = init_logger(__name__) @config @@ -48,7 +51,15 @@ class PoolerConfig: """ ## for classification models - activation: bool | None = None + softmax: float | None = None + """ + softmax will be deprecated, please use use_activation instead. + """ + activation: float | None = None + """ + activation will be deprecated, please use use_activation instead. + """ + use_activation: bool | None = None """ Whether to apply activation function to the classification outputs. Defaults to True. @@ -59,11 +70,6 @@ class PoolerConfig: """ ## for reward models - softmax: bool | None = None - """ - Whether to apply softmax to the reward outputs. - Defaults to True. - """ step_tag_id: int | None = None """ If set, only the score corresponding to the `step_tag_id` in the @@ -77,6 +83,10 @@ class PoolerConfig: `math-shepherd-mistral-7b-prm` model. """ + def __post_init__(self): + # raise deprecated warning for softmax and activation + self.use_activation = get_use_activation(self) + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -94,3 +104,19 @@ class PoolerConfig: factors: list[Any] = [] hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str + + +def get_use_activation(o: object): + if softmax := getattr(o, "softmax", None) is not None: + logger.warning_once( + "softmax will be deprecated, please use use_activation instead." + ) + return softmax + + if activation := getattr(o, "activation", None) is not None: + logger.warning_once( + "activation will be deprecated, please use use_activation instead." 
+ ) + return activation + + return getattr(o, "use_activation", None) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 71939d6c41dfa..f3aa5351e5302 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -107,6 +107,7 @@ from vllm.entrypoints.utils import ( ) from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager +from vllm.tasks import POOLING_TASKS from vllm.usage.usage_lib import UsageContext from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import is_valid_ipv6_address @@ -1748,12 +1749,7 @@ async def init_app_state( log_error_stack=args.log_error_stack, ) ) - if ( - any( - task in supported_tasks - for task in ["token_embed", "token_classify", "plugin"] - ) - ) + if any(task in POOLING_TASKS for task in supported_tasks) else None ) state.openai_serving_embedding = ( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0778e4d787905..d0061f9d5b40f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -49,6 +49,8 @@ from openai.types.responses.response_reasoning_item import ( ) from openai_harmony import Message as OpenAIHarmonyMessage +from vllm.config.pooler import get_use_activation +from vllm.tasks import PoolingTask from vllm.utils.serial_utils import ( EmbedDType, EncodingFormat, @@ -1669,8 +1671,58 @@ class EmbeddingChatRequest(OpenAIBaseModel): EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest -PoolingCompletionRequest = EmbeddingCompletionRequest -PoolingChatRequest = EmbeddingChatRequest + +class PoolingCompletionRequest(EmbeddingCompletionRequest): + task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + use_activation=get_use_activation(self), + ) + + +class PoolingChatRequest(EmbeddingChatRequest): + task: PoolingTask | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "If it is a classify or token_classify task, the default is True; " + "for other tasks, this value should be None.", + ) + + def to_pooling_params(self): + return PoolingParams( + truncate_prompt_tokens=self.truncate_prompt_tokens, + dimensions=self.dimensions, + normalize=self.normalize, + use_activation=get_use_activation(self), + ) + T = TypeVar("T") @@ -1686,6 +1738,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): """ data: T + task: PoolingTask = "plugin" encoding_format: EncodingFormat = "float" embed_dtype: EmbedDType = Field( default="float32", @@ -1749,14 +1802,27 @@ class ScoreRequest(OpenAIBaseModel): ), ) - activation: bool | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:score-extra-params] def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=get_use_activation(self), ) @@ -1783,14 +1849,27 @@ class RerankRequest(OpenAIBaseModel): ), ) - activation: bool | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. " + "Default is True.", + ) # --8<-- [end:rerank-extra-params] def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=get_use_activation(self), ) @@ -1958,14 +2037,27 @@ class ClassificationRequest(OpenAIBaseModel): ), ) - activation: bool | None = None + softmax: bool | None = Field( + default=None, + description="softmax will be deprecated, please use use_activation instead.", + ) + activation: bool | None = Field( + default=None, + description="activation will be deprecated, please use use_activation instead.", + ) + + use_activation: bool | None = Field( + default=None, + description="Whether to use activation for classification outputs. 
" + "Default is True.", + ) # --8<-- [end:classification-extra-params] def to_pooling_params(self): return PoolingParams( truncate_prompt_tokens=self.truncate_prompt_tokens, - activation=self.activation, + use_activation=get_use_activation(self), ) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 568896ccbf1b7..0eade272111f1 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -170,15 +170,24 @@ class OpenAIServingPooling(OpenAIServing): pooling_params = request.to_pooling_params() pooling_task: PoolingTask - if "token_embed" in self.supported_tasks: - pooling_task = "token_embed" - elif "token_classify" in self.supported_tasks: - pooling_task = "token_classify" - elif "plugin" in self.supported_tasks: - pooling_task = "plugin" + if request.task is None: + if "token_embed" in self.supported_tasks: + pooling_task = "token_embed" + elif "token_classify" in self.supported_tasks: + pooling_task = "token_classify" + elif "plugin" in self.supported_tasks: + pooling_task = "plugin" + else: + return self.create_error_response( + f"pooling_task must be one of {self.supported_tasks}." + ) else: + pooling_task = request.task + + if pooling_task not in self.supported_tasks: return self.create_error_response( - f"pooling_task must be one of {self.supported_tasks}." + f"Task {pooling_task} is not supported, it" + f" must be one of {self.supported_tasks}." ) try: diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 145f18f235662..7dd02e32ff211 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -607,7 +607,7 @@ class ClassifierPooler(Pooler): pooled_data -= self.logit_bias pooling_params = get_pooling_params(pooling_metadata) - flags = [p.activation for p in pooling_params] + flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: scores = self.act_fn(pooled_data) if flags[0] else pooled_data @@ -681,7 +681,7 @@ class TokenClassifierPoolerHead(nn.Module): if self.logit_bias is not None: scores -= self.logit_bias - if pooling_param.activation: + if pooling_param.use_activation: scores = self.act_fn(scores) # scores shape: [n_token, num_labels] diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index ac5949cda9de9..3bd02121f018e 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -53,8 +53,8 @@ class JambaForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: pooler_config = vllm_config.model_config.pooler_config - if pooler_config.activation is None: - pooler_config.activation = False + if pooler_config.use_activation is None: + pooler_config.use_activation = False class JinaRobertaModelConfig(VerifyAndUpdateConfig): diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 090d924144659..72a8320cc1bf8 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy -from typing import TYPE_CHECKING, Annotated, Any, Optional +from typing import Annotated, Any, Optional import msgspec +from vllm.config import ModelConfig, PoolerConfig +from vllm.config.pooler import get_use_activation from vllm.sampling_params import RequestOutputKind from vllm.tasks import PoolingTask -if TYPE_CHECKING: - from vllm.config import 
ModelConfig, PoolerConfig - class PoolingParams( msgspec.Struct, @@ -25,10 +24,12 @@ class PoolingParams( Set to -1 to use the model's default truncation size. Set to k to keep only the last k tokens (left truncation). Set to None to disable truncation. - normalize: Whether to normalize the embeddings outputs. dimensions: Reduce the dimensions of embeddings if model support matryoshka representation. - activation: Whether to apply activation function to + normalize: Whether to normalize the embeddings outputs. + softmax: softmax will be deprecated, please use use_activation instead. + activation: activation will be deprecated, please use use_activation instead. + use_activation: Whether to apply activation function to the classification outputs. """ @@ -44,7 +45,9 @@ class PoolingParams( ## for classification, scoring and rerank # --8<-- [start:classification-pooling-params] + softmax: bool | None = None activation: bool | None = None + use_activation: bool | None = None # --8<-- [end:classification-pooling-params] ## for step pooling models @@ -59,16 +62,16 @@ class PoolingParams( @property def all_parameters(self) -> list[str]: - return ["dimensions", "normalize", "activation"] + return ["dimensions", "normalize", "use_activation"] @property def valid_parameters(self): return { "embed": ["dimensions", "normalize"], - "classify": ["activation"], - "score": ["activation"], + "classify": ["use_activation"], + "score": ["use_activation"], "token_embed": ["dimensions", "normalize"], - "token_classify": ["activation"], + "token_classify": ["use_activation"], } def clone(self) -> "PoolingParams": @@ -84,6 +87,9 @@ class PoolingParams( msg = f"You cannot overwrite {self.task=!r} with {task=!r}!" raise ValueError(msg) + # raise deprecated warning for softmax and activation + self.use_activation = get_use_activation(self) + # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": @@ -168,8 +174,8 @@ class PoolingParams( raise ValueError("Dimensions must be greater than 0") elif self.task in ["classify", "score", "token_classify"]: - if self.activation is None: - self.activation = True + if self.use_activation is None: + self.use_activation = True else: raise ValueError(f"Unknown pooling task: {self.task}") @@ -197,7 +203,7 @@ class PoolingParams( f"task={self.task}, " f"normalize={self.normalize}, " f"dimensions={self.dimensions}, " - f"activation={self.activation}, " + f"use_activation={self.use_activation}, " f"step_tag_id={self.step_tag_id}, " f"returned_token_ids={self.returned_token_ids}, " f"requires_token_ids={self.requires_token_ids}, " From 1994de99ea0bf8dd84257a19800f4f62526a7edf Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Thu, 30 Oct 2025 05:27:53 -0700 Subject: [PATCH 020/976] [CI Failure] Fix test_kv_cache_model_load_and_run (#27717) Signed-off-by: Huamin Li <3ericli@gmail.com> --- tests/quantization/test_fp8.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7f863a169d5f9..bb3572752d9e2 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,7 +49,18 @@ def test_model_load_and_run( KV_CACHE_MODELS = [ # AutoFP8 format using separate .k_scale and .v_scale - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", + # The original checkpoint below was removed from the Hub. To unblock CI and + # until a small replacement with split K/V scales is found, skip this case. 
+ # See PR #27717 for context. + pytest.param( + "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", + marks=pytest.mark.skip( + reason=( + "Checkpoint removed from HF; temporarily disabling this " + "AutoFP8 split K/V case (PR #27717)." + ) + ), + ), ] From 4e68cc9b6aa2b9cfe8d799c2b1cd156a01bca438 Mon Sep 17 00:00:00 2001 From: Zhiyuan Li Date: Thu, 30 Oct 2025 21:02:27 +0800 Subject: [PATCH 021/976] [Model] Introduce Kimi Linear to vLLM (#27809) Signed-off-by: lizhiyuan Signed-off-by: Zhiyuan Li --- docs/models/supported_models.md | 1 + tests/models/registry.py | 3 + vllm/config/compilation.py | 1 + vllm/config/model.py | 1 + vllm/model_executor/layers/fla/ops/kda.py | 2 +- vllm/model_executor/layers/kda.py | 426 +++++++++++ .../layers/mamba/mamba_utils.py | 41 ++ vllm/model_executor/layers/mla.py | 7 +- vllm/model_executor/models/config.py | 51 +- vllm/model_executor/models/kimi_linear.py | 663 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + .../transformers_utils/configs/kimi_linear.py | 144 ++++ vllm/v1/worker/gpu_model_runner.py | 29 +- 15 files changed, 1325 insertions(+), 48 deletions(-) create mode 100644 vllm/model_executor/layers/kda.py create mode 100644 vllm/model_executor/models/kimi_linear.py create mode 100644 vllm/transformers_utils/configs/kimi_linear.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4d50c809d1966..c9744d31f0efc 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -382,6 +382,7 @@ th { | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | | `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | +| `KimiLinearForCausalLM` | Kimi-Linear-48B-A3B-Base, Kimi-Linear-48B-A3B-Instruct | `moonshotai/Kimi-Linear-48B-A3B-Base`, `moonshotai/Kimi-Linear-48B-A3B-Instruct` | | ✅︎ | | `Lfm2ForCausalLM` | LFM2 | `LiquidAI/LFM2-1.2B`, `LiquidAI/LFM2-700M`, `LiquidAI/LFM2-350M`, etc. | ✅︎ | ✅︎ | | `Lfm2MoeForCausalLM` | LFM2MoE | `LiquidAI/LFM2-8B-A1B-preview`, etc. | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 17b1d7b527f6b..9a2a1eb5f1a74 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -296,6 +296,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "random": "ai21labs/Jamba-tiny-random", }, ), + "KimiLinearForCausalLM": _HfExamplesInfo( + "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True + ), "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B"), "Lfm2MoeForCausalLM": _HfExamplesInfo( "LiquidAI/LFM2-8B-A1B", min_transformers_version="4.58" diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index f3ed78779a995..6a5bd5ef4e07c 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -453,6 +453,7 @@ class CompilationConfig: "vllm::linear_attention", "vllm::plamo2_mamba_mixer", "vllm::gdn_attention", + "vllm::kda_attention", "vllm::sparse_attn_indexer", ] diff --git a/vllm/config/model.py b/vllm/config/model.py index 2151939d5a9f6..092c67e7bed8c 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1236,6 +1236,7 @@ class ModelConfig: "deepseek_v32", "deepseek_mtp", "kimi_k2", + "kimi_linear", "longcat_flash", ): return self.hf_text_config.kv_lora_rank is not None diff --git a/vllm/model_executor/layers/fla/ops/kda.py b/vllm/model_executor/layers/fla/ops/kda.py index a10847d347d13..700f287ca4569 100644 --- a/vllm/model_executor/layers/fla/ops/kda.py +++ b/vllm/model_executor/layers/fla/ops/kda.py @@ -1304,7 +1304,7 @@ def kda_gate_fwd_kernel( tl.store(y_ptr, b_y.to(y.dtype.element_ty), boundary_check=(0, 1)) -def kda_gate_fwd( +def fused_kda_gate( g: torch.Tensor, A: torch.Tensor, head_k_dim: int, diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py new file mode 100644 index 0000000000000..c45e7546fac1e --- /dev/null +++ b/vllm/model_executor/layers/kda.py @@ -0,0 +1,426 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +from einops import rearrange +from torch import nn + +from vllm.attention import AttentionBackend +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config +from vllm.distributed import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader +from vllm.model_executor.utils import set_weight_attrs +from vllm.utils.torch_utils import direct_register_custom_op +from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata + +from .fla.ops.kda import ( + FusedRMSNormGated, + chunk_kda, + fused_kda_gate, + fused_recurrent_kda, +) +from .linear import ( + ColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from .mamba.abstract import MambaBase +from .mamba.mamba_utils import MambaStateDtypeCalculator, MambaStateShapeCalculator +from .mamba.ops.causal_conv1d import causal_conv1d_fn, causal_conv1d_update +from .quantization.base_config import QuantizationConfig + +logger = init_logger(__name__) + + +def kda_attention( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self._forward(hidden_states=hidden_states, output=output) + + +def kda_attention_fake( + 
hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="kda_attention", + op_func=kda_attention, + mutates_args=["output"], + fake_impl=kda_attention_fake, +) + + +class KimiDeltaAttention(nn.Module, MambaBase): + @property + def mamba_type(self) -> str: + return "linear_attention" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend + + return GDNAttentionBackend + + def get_state_dtype( + self, + ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]: + if self.model_config is None or self.cache_config is None: + raise ValueError("model_config and cache_config must be set") + return MambaStateDtypeCalculator.kda_state_dtype( + self.model_config.dtype, self.cache_config.mamba_cache_dtype + ) + + def get_state_shape( + self, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.kda_state_shape( + self.tp_size, self.num_heads, self.head_dim, conv_kernel_size=self.conv_size + ) + + def __init__( + self, + layer_idx: int, + hidden_size: int, + quant_config: QuantizationConfig | None = None, + cache_config: CacheConfig | None = None, + model_config: ModelConfig | None = None, + rms_norm_eps: float = 1e-5, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.hidden_size = hidden_size + self.model_config = model_config + self.cache_config = cache_config + if model_config is None: + raise ValueError("model_config must be provided") + kda_config = model_config.linear_attn_config + self.head_dim = kda_config["head_dim"] + self.num_heads = kda_config["num_heads"] + self.layer_idx = layer_idx + self.prefix = prefix + assert self.num_heads % self.tp_size == 0 + self.local_num_heads = divide(self.num_heads, self.tp_size) + + projection_size = self.head_dim * self.num_heads + self.conv_size = kda_config["short_conv_kernel_size"] + + self.q_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.k_proj", + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.v_proj", + ) + + self.f_a_proj = ReplicatedLinear( + self.hidden_size, + self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.f_a_proj", + ) + + self.f_b_proj = ColumnParallelLinear( + self.head_dim, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.f_b_proj", + ) + self.dt_bias = nn.Parameter( + torch.empty(divide(projection_size, self.tp_size), dtype=torch.float32) + ) + + set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) + + self.b_proj = ColumnParallelLinear( + self.hidden_size, + self.num_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.b_proj", + ) + + self.q_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.q_conv1d", + ) + self.k_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + 
prefix=f"{prefix}.k_conv1d", + ) + self.v_conv1d = ColumnParallelLinear( + input_size=self.conv_size, + output_size=projection_size, + bias=False, + params_dtype=torch.float32, + prefix=f"{prefix}.v_conv1d", + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.q_conv1d.weight.data = self.q_conv1d.weight.data.unsqueeze(1) + self.k_conv1d.weight.data = self.k_conv1d.weight.data.unsqueeze(1) + self.v_conv1d.weight.data = self.v_conv1d.weight.data.unsqueeze(1) + + self.A_log = nn.Parameter( + torch.empty(1, 1, self.local_num_heads, 1, dtype=torch.float32) + ) + set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(2)}) + + self.g_a_proj = ReplicatedLinear( + self.hidden_size, + self.head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.g_a_proj", + ) + self.g_b_proj = ColumnParallelLinear( + self.head_dim, + projection_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.g_b_proj", + ) + self.o_norm = FusedRMSNormGated( + self.head_dim, eps=rms_norm_eps, activation="sigmoid" + ) + self.o_proj = RowParallelLinear( + projection_size, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + def forward( + self, + hidden_states: torch.Tensor, + positions: torch.Tensor, + output: torch.Tensor, + ) -> None: + return torch.ops.vllm.kda_attention( + hidden_states, + output, + self.prefix, + ) + + def _forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + ) -> None: + forward_context = get_forward_context() + attn_metadata: AttentionMetadata = forward_context.attn_metadata + + if attn_metadata is None: + # V1 profile run + # Mimic the memory allocation in the real run + q = torch.empty_like(hidden_states) + k = torch.empty_like(hidden_states) + v = torch.empty_like(hidden_states) + g = hidden_states.new_empty( + hidden_states.size(0), + self.local_num_heads, + self.head_dim, + dtype=torch.float32, + ) + beta = torch.empty( + hidden_states.size(0), self.local_num_heads, dtype=torch.float32 + ) + core_attn_out = torch.empty_like(hidden_states) + return + + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + assert isinstance(attn_metadata, GDNAttentionMetadata) + has_initial_state = attn_metadata.has_initial_state + non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc + non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor # noqa: E501 + constant_caches = self.kv_cache[forward_context.virtual_engine] + + (conv_state_q, conv_state_k, conv_state_v, recurrent_state) = constant_caches + # deal with strides + conv_state_q = conv_state_q.transpose(-1, -2) + conv_state_k = conv_state_k.transpose(-1, -2) + conv_state_v = conv_state_v.transpose(-1, -2) + + q_proj_states = self.q_proj(hidden_states)[0] + k_proj_states = self.k_proj(hidden_states)[0] + v_proj_states = self.v_proj(hidden_states)[0] + + q_conv_weights = self.q_conv1d.weight.view( + self.q_conv1d.weight.size(0), self.q_conv1d.weight.size(2) + ) + k_conv_weights = self.k_conv1d.weight.view( + self.k_conv1d.weight.size(0), self.k_conv1d.weight.size(2) + ) + 
v_conv_weights = self.v_conv1d.weight.view( + self.v_conv1d.weight.size(0), self.v_conv1d.weight.size(2) + ) + if attn_metadata.num_prefills > 0: + q_proj_states = q_proj_states.transpose(0, 1) + k_proj_states = k_proj_states.transpose(0, 1) + v_proj_states = v_proj_states.transpose(0, 1) + q = causal_conv1d_fn( + q_proj_states, + q_conv_weights, + self.q_conv1d.bias, + activation="silu", + conv_states=conv_state_q, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + k = causal_conv1d_fn( + k_proj_states, + k_conv_weights, + self.k_conv1d.bias, + activation="silu", + conv_states=conv_state_k, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + v = causal_conv1d_fn( + v_proj_states, + v_conv_weights, + self.v_conv1d.bias, + activation="silu", + conv_states=conv_state_v, + has_initial_state=has_initial_state, + cache_indices=non_spec_state_indices_tensor, + query_start_loc=non_spec_query_start_loc, + metadata=attn_metadata, + ).transpose(0, 1) + else: + decode_conv_indices = non_spec_state_indices_tensor[ + : attn_metadata.num_decodes + ] + q = causal_conv1d_update( + q_proj_states, + conv_state_q, + q_conv_weights, + self.q_conv1d.bias, + activation="silu", + conv_state_indices=decode_conv_indices, + validate_data=True, + ) + k = causal_conv1d_update( + k_proj_states, + conv_state_k, + k_conv_weights, + self.k_conv1d.bias, + activation="silu", + conv_state_indices=decode_conv_indices, + validate_data=True, + ) + v = causal_conv1d_update( + v_proj_states, + conv_state_v, + v_conv_weights, + self.v_conv1d.bias, + activation="silu", + conv_state_indices=decode_conv_indices, + validate_data=True, + ) + + q, k, v = map( + lambda x: rearrange(x, "n (h d) -> 1 n h d", d=self.head_dim), (q, k, v) + ) + + beta = self.b_proj(hidden_states)[0].float().sigmoid() + + g = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0] + g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias) + + beta = beta.unsqueeze(0) + g = g.unsqueeze(0) + + if attn_metadata.num_prefills > 0: + zero_idx = non_spec_state_indices_tensor[~has_initial_state] + recurrent_state[zero_idx] = 0 + initial_state = recurrent_state[non_spec_state_indices_tensor].contiguous() + ( + core_attn_out_non_spec, + last_recurrent_state, + ) = chunk_kda( + q=q, + k=k, + v=v, + g=g, + beta=beta, + initial_state=initial_state, + output_final_state=True, + use_qk_l2norm_in_kernel=True, + cu_seqlens=non_spec_query_start_loc, + ) + # Init cache + recurrent_state[non_spec_state_indices_tensor] = last_recurrent_state + else: + ( + core_attn_out_non_spec, + last_recurrent_state, + ) = fused_recurrent_kda( + q=q, + k=k, + v=v, + g=g, + beta=beta, + initial_state=recurrent_state, + use_qk_l2norm_in_kernel=True, + cu_seqlens=non_spec_query_start_loc, + ssm_state_indices=non_spec_state_indices_tensor, + ) + + g_proj_states = self.g_b_proj(self.g_a_proj(hidden_states)[0])[0] + g = rearrange(g_proj_states, "... (h d) -> ... 
h d", d=self.head_dim) + core_attn_out = self.o_norm(core_attn_out_non_spec, g) + core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)") + + output[:] = self.o_proj(core_attn_out)[0] diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 91a45623582d5..831dab2fbb01c 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -80,6 +80,15 @@ class MambaStateDtypeCalculator: state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) return (state_dtype, state_dtype) + @classmethod + def kda_state_dtype( + cls, + model_dtype: ModelDType | torch.dtype, + mamba_cache_dtype: MambaDType, + ): + state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) + return (state_dtype, state_dtype, state_dtype, torch.float32) + class MambaStateShapeCalculator: @classmethod @@ -182,3 +191,35 @@ class MambaStateShapeCalculator: head_v_dim, ) return conv_state_shape, temporal_state_shape + + @classmethod + def kda_state_shape( + cls, + tp_world_size: int, + num_heads: int, + head_dim: int, + num_k_heads: int | None = None, + head_k_dim: int | None = None, + conv_kernel_size: int = 4, + num_spec: int = 0, + ) -> tuple[tuple[int, int], tuple[int, int], tuple[int, int], tuple[int, int, int]]: + if num_k_heads is None: + num_k_heads = num_heads + if head_k_dim is None: + head_k_dim = head_dim + + proj_size = num_heads * head_dim + proj_k_size = num_k_heads * head_k_dim + + conv_state_shape = (divide(proj_size, tp_world_size), conv_kernel_size - 1) + conv_state_k_shape = (divide(proj_k_size, tp_world_size), conv_kernel_size - 1) + recurrent_state_shape = (divide(num_heads, tp_world_size), head_dim, head_dim) + + conv_state_shape = conv_state_shape[1], conv_state_shape[0] + conv_state_k_shape = conv_state_k_shape[1], conv_state_k_shape[0] + return ( + conv_state_shape, + conv_state_k_shape, + conv_state_k_shape, + recurrent_state_shape, + ) diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index 34f05f2ee9624..c4c44b83ae6bf 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -147,9 +147,10 @@ class MultiHeadLatentAttentionWrapper(CustomOp): # Add head dim of 1 to k_pe k_pe = k_pe.unsqueeze(1) - q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb( - positions, q[..., self.qk_nope_head_dim :], k_pe - ) + if self.rotary_emb is not None: + q[..., self.qk_nope_head_dim :], k_pe = self.rotary_emb( + positions, q[..., self.qk_nope_head_dim :], k_pe + ) if self.indexer and self.is_sparse: _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 3bd02121f018e..b0a48a9f1d458 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy +from math import lcm from typing import TYPE_CHECKING import vllm.envs as envs @@ -8,7 +9,7 @@ from vllm.logger import init_logger from vllm.model_executor.models import ModelRegistry from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec +from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec if TYPE_CHECKING: from vllm.config 
import VllmConfig @@ -347,12 +348,28 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] # get attention page size (for 1 token) - attn_page_size_1_token = FullAttentionSpec( - block_size=1, - num_kv_heads=model_config.get_num_kv_heads(parallel_config), - head_size=model_config.get_head_size(), - dtype=kv_cache_dtype, - ).page_size_bytes + # Attention backend constraints: + # - FlashAttention (FA) requires block size to be multiple of 16 + # - MLA (Multi-head Latent Attention) requires larger alignment: + # * CUTLASS_MLA backend: kernel_block_size 128 alignment + # * Other MLA backends: kernel_block_size 64 alignment + if model_config.use_mla: + use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" + kernel_block_alignment_size = 128 if use_cutlass_mla else 64 + attn_page_size_1_token = MLAAttentionSpec( + block_size=1, + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + dtype=kv_cache_dtype, + ).page_size_bytes + else: + kernel_block_alignment_size = 16 + attn_page_size_1_token = FullAttentionSpec( + block_size=1, + num_kv_heads=model_config.get_num_kv_heads(parallel_config), + head_size=model_config.get_head_size(), + dtype=kv_cache_dtype, + ).page_size_bytes model_cls, _ = ModelRegistry.resolve_model_cls( model_config.architecture, @@ -372,17 +389,6 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): if mamba_page_size == 0: return - # Attention backend constraints: - # - FlashAttention (FA) requires block size to be multiple of 16 - # - MLA (Multi-head Latent Attention) requires larger alignment: - # * CUTLASS_MLA backend: 128-byte alignment - # * Other MLA backends: 64-byte alignment - if model_config.use_mla: - use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA" - kernel_block_alignment_size = 128 if use_cutlass_mla else 64 - else: - kernel_block_alignment_size = 16 - if cache_config.enable_prefix_caching: # With prefix caching, select attention block size to # optimize for mamba kernel performance @@ -400,15 +406,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # easily by changing the way we layout chunks in the # mamba2 kernels. 
- from math import gcd - - def lcm(a, b): - return a * b // gcd(a, b) - - base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size() - + base_chunk_size = model_config.get_mamba_chunk_size() attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token) - chunk_size = lcm(base_chunk_size, kernel_block_alignment_size) attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size) cache_config.mamba_block_size = attn_block_size diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py new file mode 100644 index 0000000000000..a60a8d764d9d1 --- /dev/null +++ b/vllm/model_executor/models/kimi_linear.py @@ -0,0 +1,663 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.distributed import ( + get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.kda import KimiDeltaAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttentionWrapper +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig + +from .interfaces import HasInnerState, IsHybrid, MixtureOfExperts, SupportsPP +from .utils import ( + PPMissingLayer, + is_pp_missing_parameter, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class KimiMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QKVParallelLinear | None = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__() + + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. Only silu is supported for now." 
+ ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class KimiMoE(nn.Module): + def __init__( + self, + config: KimiLinearConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + layer_idx: int = 0, + ): + super().__init__() + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + moe_intermediate_size = config.moe_intermediate_size + num_experts = config.num_experts + moe_renormalize = config.moe_renormalize + self.tp_size = get_tensor_model_parallel_world_size() + self.routed_scaling_factor = config.routed_scaling_factor + self.num_shared_experts = config.num_shared_experts + self.layer_idx = layer_idx + + if config.hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now." + ) + + # Gate always runs at half / full precision for now. + self.gate = ReplicatedLinear( + hidden_size, + num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + self.gate.e_score_correction_bias = nn.Parameter(torch.empty(num_experts)) + + self.experts = FusedMoE( + num_experts=num_experts, + top_k=config.num_experts_per_token, + hidden_size=hidden_size, + intermediate_size=moe_intermediate_size, + reduce_results=False, + renormalize=moe_renormalize, + quant_config=quant_config, + use_grouped_topk=config.use_grouped_topk, + num_expert_group=config.num_expert_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.moe_router_activation_func, + e_score_correction_bias=self.gate.e_score_correction_bias, + ) + + if self.num_shared_experts is not None: + intermediate_size = moe_intermediate_size * self.num_shared_experts + self.shared_experts = KimiMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_size = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_size) + if self.num_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = ( + self.experts(hidden_states=hidden_states, router_logits=router_logits) + * self.routed_scaling_factor + ) + if shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + + if self.tp_size > 1: + final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states) + return final_hidden_states.view(num_tokens, hidden_size) + + +class KimiMLAAttention(nn.Module): + """ + Main reference: DeepseekV2 vllm Implementation + """ + + def __init__( + self, + config: KimiLinearConfig, + hidden_size: int, + num_heads: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int | None, + kv_lora_rank: int, + rope_theta: float = 10000, + use_nope: bool = False, + rope_scaling: dict[str, Any] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + 
self.num_heads = num_heads + tp_size = get_tensor_model_parallel_world_size() + self.num_local_heads = num_heads // tp_size + self.scaling = self.qk_head_dim**-0.5 + self.rope_theta = rope_theta + self.use_nope = use_nope + assert self.use_nope is True + assert self.q_lora_rank is None + assert rope_scaling is None + assert num_heads % tp_size == 0 + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_a_proj_with_mqa", + ) + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.num_heads * self.qk_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.kv_a_layernorm = RMSNorm( + self.kv_lora_rank, + eps=config.rms_norm_eps, + ) + self.kv_b_proj = ColumnParallelLinear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_b_proj", + ) + self.o_proj = RowParallelLinear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + mla_modules = MLAModules( + kv_a_layernorm=self.kv_a_layernorm, + kv_b_proj=self.kv_b_proj, + rotary_emb=None, + o_proj=self.o_proj, + fused_qkv_a_proj=None, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, + q_a_layernorm=None, + q_b_proj=None, + q_proj=self.q_proj, + indexer=None, + is_sparse=False, + topk_indices_buffer=None, + ) + self.mla_attn = MultiHeadLatentAttentionWrapper( + self.hidden_size, + self.num_local_heads, + self.scaling, + self.qk_nope_head_dim, + self.qk_rope_head_dim, + self.v_head_dim, + self.q_lora_rank, + self.kv_lora_rank, + mla_modules, + cache_config, + quant_config, + prefix, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + output: torch.Tensor, + ) -> None: + output[:] = self.mla_attn(positions, hidden_states) + + +class KimiDecoderLayer(nn.Module): + def __init__( + self, + config: KimiLinearConfig, + layer_idx: int, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + parallel_config: ParallelConfig | None = None, + model_config: ModelConfig | None = None, + prefix: str = "", + **kwargs, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + + self.is_moe = config.is_moe + + if config.is_kda_layer(layer_idx): + self.self_attn = KimiDeltaAttention( + layer_idx=layer_idx, + hidden_size=config.hidden_size, + quant_config=quant_config, + cache_config=cache_config, + model_config=config, + prefix=f"{prefix}.self_attn", + ) + else: + self.self_attn = KimiMLAAttention( + layer_idx=layer_idx, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + quant_config=quant_config, + cache_config=cache_config, + model_config=model_config, + prefix=f"{prefix}.self_attn", + config=config, + qk_nope_head_dim=config.qk_nope_head_dim, + qk_rope_head_dim=config.qk_rope_head_dim, + v_head_dim=config.v_head_dim, + q_lora_rank=config.q_lora_rank, + kv_lora_rank=config.kv_lora_rank, + use_nope=config.mla_use_nope, + ) + + if ( + self.is_moe + and config.num_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0 + ): + self.block_sparse_moe = KimiMoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.mlp = self.block_sparse_moe + else: + self.mlp = KimiMLP( + hidden_size=self.hidden_size, + 
intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + attn_output = torch.empty_like(hidden_states) + self.self_attn( + hidden_states=hidden_states, + positions=positions, + output=attn_output, + ) + hidden_states = attn_output + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile +class KimiLinearModel(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + parallel_config = vllm_config.parallel_config + self.config = config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens", + ) + else: + self.embed_tokens = PPMissingLayer() + + extra_kwargs = {} + + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + return KimiDecoderLayer( + config, + layer_idx, + cache_config, + quant_config, + parallel_config, + model_config, + prefix, + **extra_kwargs, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + get_layer, + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + world_size = get_tensor_model_parallel_world_size() + assert config.num_attention_heads % world_size == 0, ( + "num_attention_heads must be divisible by world_size" + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for _, layer in enumerate(self.layers[self.start_layer : self.end_layer]): + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class KimiLinearForCausalLM( + nn.Module, 
HasInnerState, SupportsPP, MixtureOfExperts, IsHybrid +): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model_config = vllm_config.model_config + self.vllm_config = vllm_config + self.config = self.model_config.hf_config + quant_config = vllm_config.quant_config + self.quant_config = quant_config + self.model = KimiLinearModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead( + self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + logit_scale = getattr(self.config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor( + self.config.vocab_size, scale=logit_scale + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs + ) + return hidden_states + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype, torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.kda_state_dtype( + vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype + ) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + tp_size = parallel_config.tensor_parallel_size + num_spec = ( + vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config + else 0 + ) + return MambaStateShapeCalculator.kda_state_shape( + tp_size, + hf_config.linear_attn_config["num_heads"], + hf_config.linear_attn_config["head_dim"], + conv_kernel_size=hf_config.linear_attn_config["short_conv_kernel_size"], + num_spec=num_spec, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.logits_processor(self.lm_head, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + if self.config.is_moe: + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_experts, + ) + else: + expert_params_mapping = [] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for args in weights: + name, loaded_weight = args[:2] + kwargs = args[2] if len(args) > 2 else {} + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: + # Models trained using ColossalAI may include these 
tensors in + # the checkpoint. Skip them. + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for idx, (param_name, weight_name, expert_id, shard_id) in enumerate( + expert_params_mapping + ): + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader( + param, + loaded_weight, + name, + expert_id=expert_id, + shard_id=shard_id, + ) + break + else: + # Skip loading extra bias for GPTQ models. + if ( + name.endswith(".bias") + and name not in params_dict + and not self.config.is_linear_attn + ): # noqa: E501 + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight, **kwargs) + loaded_params.add(name) + + +def get_spec_layer_idx_from_weight_name( + config: KimiLinearConfig, weight_name: str +) -> int | None: + if hasattr(config, "num_nextn_predict_layers") and ( + config.num_nextn_predict_layers > 0 + ): + layer_idx = config.num_hidden_layers + for i in range(config.num_nextn_predict_layers): + if weight_name.startswith(f"model.layers.{layer_idx + i}."): + return layer_idx + i + return None diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0027954ac2771..8e4413c90cf6c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -118,6 +118,7 @@ _TEXT_GENERATION_MODELS = { "InternLM3ForCausalLM": ("llama", "LlamaForCausalLM"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), + "KimiLinearForCausalLM": ("kimi_linear", "KimiLinearForCausalLM"), # noqa: E501 "Lfm2ForCausalLM": ("lfm2", "Lfm2ForCausalLM"), "Lfm2MoeForCausalLM": ("lfm2_moe", "Lfm2MoeForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 34c0429a80679..b1f4e3e2a9831 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -79,6 +79,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( deepseek_v3="DeepseekV3Config", deepseek_v32="DeepseekV3Config", flex_olmo="FlexOlmoConfig", + kimi_linear="KimiLinearConfig", kimi_vl="KimiVLConfig", Llama_Nemotron_Nano_VL="Nemotron_Nano_VL_Config", RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) diff --git 
a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index befe9cdae76a1..663a8e44d71dd 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -19,6 +19,7 @@ from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig from vllm.transformers_utils.configs.jais import JAISConfig +from vllm.transformers_utils.configs.kimi_linear import KimiLinearConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.lfm2_moe import Lfm2MoeConfig from vllm.transformers_utils.configs.medusa import MedusaConfig @@ -54,6 +55,7 @@ __all__ = [ "MiDashengLMConfig", "MLPSpeculatorConfig", "MoonViTConfig", + "KimiLinearConfig", "KimiVLConfig", "NemotronConfig", "NemotronHConfig", diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py new file mode 100644 index 0000000000000..65ddf48c5249b --- /dev/null +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from transformers.configuration_utils import PretrainedConfig + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class KimiLinearConfig(PretrainedConfig): + model_type = "kimi_linear" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + model_type="kimi_linear", + vocab_size=163840, + hidden_size=4096, + head_dim=None, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + rope_theta=10000.0, + rope_scaling=None, + tie_word_embeddings=False, + moe_intermediate_size: int | None = None, + moe_renormalize: bool = True, + moe_router_activation_func: str = "sigmoid", + num_experts: int | None = None, + num_experts_per_token: int | None = None, + num_shared_experts: int = 0, + routed_scaling_factor: float = 1.0, + first_k_dense_replace: int = 0, + moe_layer_freq: int = 1, + use_grouped_topk: bool = True, + num_expert_group: int = 1, + topk_group: int = 1, + q_lora_rank: int | None = None, + kv_lora_rank: int | None = None, + qk_nope_head_dim: int | None = None, + qk_rope_head_dim: int | None = None, + v_head_dim: int | None = None, + mla_use_nope: bool | None = False, + num_nextn_predict_layers: int = 0, + linear_attn_config: dict | None = None, + **kwargs, + ): + self.model_type = model_type + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.head_dim = ( + head_dim if head_dim is not None else hidden_size // num_attention_heads + ) + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.qk_nope_head_dim = qk_nope_head_dim + 
self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.mla_use_nope = mla_use_nope + # moe config + self.num_experts = num_experts + self.num_experts_per_token = num_experts_per_token + self.moe_renormalize = moe_renormalize + self.num_shared_experts = num_shared_experts + self.routed_scaling_factor = routed_scaling_factor + self.moe_router_activation_func = moe_router_activation_func + assert self.moe_router_activation_func in ("softmax", "sigmoid") + self.moe_intermediate_size = moe_intermediate_size + self.first_k_dense_replace = first_k_dense_replace + self.moe_layer_freq = moe_layer_freq + self.use_grouped_topk = use_grouped_topk + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.num_nextn_predict_layers = num_nextn_predict_layers + + if linear_attn_config is not None: + assert linear_attn_config["kda_layers"] is not None + assert linear_attn_config["full_attn_layers"] is not None + self.linear_attn_config = linear_attn_config + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def is_mla(self): + return ( + self.q_lora_rank is not None + or self.kv_lora_rank is not None + or self.qk_nope_head_dim is not None + or self.qk_rope_head_dim is not None + or self.v_head_dim is not None + or self.mla_use_nope is True + ) + + @property + def is_moe(self): + return self.num_experts is not None + + @property + def is_linear_attn(self) -> bool: + return not ( + self.linear_attn_config is None + or ( + isinstance(self.linear_attn_config, dict) + and self.linear_attn_config["kda_layers"] is not None + and len(self.linear_attn_config["kda_layers"]) == 0 + ) + ) + + def is_kda_layer(self, layer_idx: int): + return ( + self.linear_attn_config is not None + and (layer_idx + 1) in self.linear_attn_config["kda_layers"] + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1fe749c614ccf..729ce462cf186 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,6 +8,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager from copy import deepcopy +from functools import reduce from itertools import product from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast @@ -4134,26 +4135,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def calculate_reorder_batch_threshold(self) -> None: """ - Check that if any backends reorder batches; that the reordering - is compatible (e.g., decode threshold is the same) + Choose the minimum reorder batch threshold from all attention groups. + Backends should be able to support lower threshold then what they request + just may have a performance penalty due to that backend treating decodes + as prefills. 
""" - for group in self._attn_group_iterator(): - attn_metadata_builder_i = group.get_metadata_builder() + min_none_high = lambda a, b: a if b is None else b if a is None else min(a, b) - # check that if any backends reorder batches; that the reordering - # is compatible (e.g., decode threshold is the same) - reorder_batch_threshold_i = attn_metadata_builder_i.reorder_batch_threshold - if reorder_batch_threshold_i is not None: - if self.reorder_batch_threshold is not None: - if reorder_batch_threshold_i != self.reorder_batch_threshold: - raise ValueError( - f"Attention backend reorders decodes with " - f"threshold {reorder_batch_threshold_i} but other " - f"backend uses threshold " - f"{self.reorder_batch_threshold}" - ) - else: - self.reorder_batch_threshold = reorder_batch_threshold_i + reorder_batch_thresholds = [ + group.get_metadata_builder().reorder_batch_threshold + for group in self._attn_group_iterator() + ] + self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) def _find_compatible_block_sizes( self, From 0fe01404082744c955d135c3634e17de1404b00c Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Thu, 30 Oct 2025 07:10:29 -0700 Subject: [PATCH 022/976] [KV offload] Enable CPU KV offload on CUDA alike Platforms (#27770) Signed-off-by: zhewenli --- tests/v1/kv_offload/test_cpu_offloading.py | 4 ---- vllm/v1/kv_offload/cpu.py | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index a5cb23c4ef0f2..b654ea4298dbb 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -12,7 +12,6 @@ from tqdm import tqdm from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch -from vllm.platforms import current_platform CPU_BLOCK_SIZES = [16, 48] @@ -64,9 +63,6 @@ class MockSubscriber: self.sub.close() -@pytest.mark.skipif( - not current_platform.is_cuda(), reason="CPU offloading only supported on CUDA" -) @pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) def test_cpu_offloading(cpu_block_size: int) -> None: """ diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index 250ed5e95af4b..f765d19ea0175 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -51,9 +51,9 @@ class CPUOffloadingSpec(OffloadingSpec): self, kv_caches: dict[str, torch.Tensor] ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: - if not current_platform.is_cuda(): + if not current_platform.is_cuda_alike(): raise Exception( - "CPU Offloading is currently only supported on CUDA GPUs" + "CPU Offloading is currently only supported on CUDA-alike GPUs" ) layer_names = list(kv_caches.keys()) From 9956aae4ead0906abe7a1840a503587cab2013c1 Mon Sep 17 00:00:00 2001 From: Fan Yin <1106310035@qq.com> Date: Thu, 30 Oct 2025 22:34:41 +0800 Subject: [PATCH 023/976] [Model][Ouro] Support Ouro Model (#27794) Signed-off-by: yinfan.1024 Signed-off-by: youkaichao Co-authored-by: yinfan.1024 Co-authored-by: youkaichao Co-authored-by: Jee Jee Li --- docs/models/supported_models.md | 1 + tests/models/registry.py | 1 + vllm/model_executor/models/ouro.py | 518 +++++++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 521 insertions(+) create mode 100644 vllm/model_executor/models/ouro.py diff --git a/docs/models/supported_models.md 
b/docs/models/supported_models.md index c9744d31f0efc..fd25647dce54b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -403,6 +403,7 @@ th { | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | +| `OuroForCausalLM` | ouro | `ByteDance/Ouro-1.4B`, `ByteDance/Ouro-2.6B`, etc. | ✅︎ | | | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 9a2a1eb5f1a74..7b5977ec58e53 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -369,6 +369,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "OrionForCausalLM": _HfExamplesInfo( "OrionStarAI/Orion-14B-Chat", trust_remote_code=True ), + "OuroForCausalLM": _HfExamplesInfo("ByteDance/Ouro-1.4B", trust_remote_code=True), "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py new file mode 100644 index 0000000000000..b8dad909c5470 --- /dev/null +++ b/vllm/model_executor/models/ouro.py @@ -0,0 +1,518 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Ouro model compatible with HuggingFace weights.""" + +from collections.abc import Iterable +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA +from .utils import ( + AutoWeightsLoader, + extract_layer_index, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + + +class OuroMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError( + f"Unsupported activation: {hidden_act}. Only silu is supported for now." + ) + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class OuroAttention(nn.Module): + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + rope_scaling: tuple | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + dual_chunk_attention_config: dict[str, Any] | None = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.dual_chunk_attention_config = dual_chunk_attention_config + + # Get total_ut_steps from config, default to 4 if not specified + total_ut_steps = getattr(config, "total_ut_steps", 4) + + # Use total number of hidden layers instead of hardcoded 24 + total_layers = config.num_hidden_layers + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + rope_scaling=rope_scaling, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.attn = nn.ModuleList() + for ut_step in range(total_ut_steps): + base_layer_idx = extract_layer_index(prefix) + unique_layer_idx = ut_step * total_layers + base_layer_idx + + unique_prefix = prefix.replace( + f"layers.{base_layer_idx}", f"layers.{unique_layer_idx}" + ) + + self.attn.append( + Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + attn_type=attn_type, + prefix=f"{unique_prefix}.attn", + **{ + "layer_idx": unique_layer_idx, + "dual_chunk_attention_config": dual_chunk_attention_config, + } + if dual_chunk_attention_config + else {}, + ) + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + current_ut: int, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn[current_ut](q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class OuroDecoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 1000000) + rope_scaling = getattr(config, "rope_scaling", None) + dual_chunk_attention_config = getattr( + config, "dual_chunk_attention_config", None + ) + + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + + self.self_attn = OuroAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + cache_config=cache_config, + quant_config=quant_config, + rope_scaling=rope_scaling, + prefix=f"{prefix}.self_attn", + attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.mlp = OuroMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + 
self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_layernorm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.post_attention_layernorm = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm_2 = RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + current_ut: int, + residual: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, hidden_states=hidden_states, current_ut=current_ut + ) + hidden_states = self.input_layernorm_2(hidden_states) + + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_attention_layernorm_2(hidden_states) + + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class OuroModel(nn.Module): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + decoder_layer_type: type[nn.Module] = OuroDecoderLayer, + ): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + # TODO (@robertgshaw2): see if this can be moved out + if cache_config.sliding_window is not None and hasattr( + config, "max_window_layers" + ): + assert config.max_window_layers == config.num_hidden_layers, ( + "Sliding window for some but all layers is not supported. " + "This model uses sliding window but `max_window_layers` = {} " + "is less than `num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + ) + ) + + self.config = config + self.quant_config = quant_config + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", + ) + + # Use the provided decoder layer type or default to OuroDecoderLayer + decoder_layer_type = decoder_layer_type or OuroDecoderLayer + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: decoder_layer_type( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.early_exit_gate = RowParallelLinear(config.hidden_size, 1, bias=True) + + self.total_ut_steps = getattr(self.config, "total_ut_steps", 4) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + + for current_ut in range(self.total_ut_steps): + residual = None + for layer in self.layers[self.start_layer : self.end_layer]: + hidden_states, residual = layer( + positions, hidden_states, current_ut, residual + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class OuroForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = OuroModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8e4413c90cf6c..7eca1a09e5365 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -148,6 +148,7 @@ _TEXT_GENERATION_MODELS = { "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), + "OuroForCausalLM": ("ouro", "OuroForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), From eebf00cb0c925404672d407674b319ebc5ae3a84 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 30 Oct 2025 23:12:05 +0800 Subject: [PATCH 024/976] [Bugfix][CPU] Fix MRoPE dispatch on the CPU backend (#27800) Signed-off-by: jiang1.li --- vllm/model_executor/layers/rotary_embedding/mrope.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 3c184ce9d6316..0592aa8f967a6 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -357,6 +357,15 @@ class 
MRotaryEmbedding(RotaryEmbeddingBase): key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key + def forward_cpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + offsets: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + return self.forward_native(positions, query, key, offsets) + @staticmethod def get_next_input_positions( mrope_position_delta: int, From e5e076cad7c1c922fa6d48049c45bead505f52a6 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 30 Oct 2025 11:24:31 -0400 Subject: [PATCH 025/976] [BugFix] Stopgap - Flashinfer Autotuner + GPT-OSS + DP/TP (#27762) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/model_executor/warmup/kernel_warmup.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 79d1927d32103..ffa3bc8f021ef 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -11,7 +11,7 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup from vllm.platforms import current_platform @@ -30,13 +30,19 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: Record known issues with vllm + flashinfer autotune here. Return True if and only if flashinfer autotune will run through without issues. """ - return not ( - vllm_config.parallel_config.data_parallel_size > 1 - and ( - envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - ) + is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or ( + vllm_config.parallel_config.tensor_parallel_size > 1 ) + is_fi_mxfp4_backend = ( + envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS + ) or ( + current_platform.is_cuda() and current_platform.is_device_capability(100) + ) # on >=sm100, default mxfp4 backend is flashinfer + is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + + return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager) def kernel_warmup(worker: "Worker"): From 60f76baa6688ce265a4205f183bd42a62d8f7179 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Thu, 30 Oct 2025 16:41:44 +0100 Subject: [PATCH 026/976] [Misc] Replace CUDA_VISIBLE_DEVICES in DP with torch.cuda.set_device for device selection on cuda-like devices (#27564) Signed-off-by: ilmarkov Co-authored-by: Tyler Michael Smith --- .../kv_connector/v1/nixl_connector.py | 12 ++++++---- vllm/v1/engine/utils.py | 11 ++++++++- vllm/v1/worker/dp_utils.py | 4 ++-- vllm/v1/worker/gpu_worker.py | 23 +++++++++++++++++++ 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 275a8c734058b..d5712bdd9feb4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1008,11 +1008,14 @@ class NixlConnectorWorker: # Enable different block lengths for different layers when MLA is used. 
self.block_len_per_layer = list[int]() self.slot_size_per_layer = list[int]() # HD bytes in kv terms + self.device_id = self.tp_rank for layer_name, cache_or_caches in xfer_buffers.items(): cache_list = cache_or_caches if split_k_and_v else [cache_or_caches] for cache in cache_list: base_addr = cache.data_ptr() + if not self.use_host_buffer and current_platform.is_cuda_alike(): + self.device_id = cache.device.index if base_addr in seen_base_addresses: continue @@ -1040,7 +1043,7 @@ class NixlConnectorWorker: "All kv cache tensors must have the same size" ) caches_data.append( - (base_addr, curr_tensor_size_bytes, self.tp_rank, "") + (base_addr, curr_tensor_size_bytes, self.device_id, "") ) logger.debug( @@ -1087,7 +1090,7 @@ class NixlConnectorWorker: block_offset = block_id * self.block_len_per_layer[i] addr = base_addr + block_offset # (addr, len, device id) - blocks_data.append((addr, kv_block_len, self.tp_rank)) + blocks_data.append((addr, kv_block_len, self.device_id)) if self._use_flashinfer: # Separate and interleave K/V regions to maintain the same @@ -1098,12 +1101,13 @@ class NixlConnectorWorker: addr = base_addr + block_offset # Register addresses for V cache (K registered first). v_addr = addr + kv_block_len - blocks_data.append((v_addr, kv_block_len, self.tp_rank)) + blocks_data.append((v_addr, kv_block_len, self.device_id)) logger.debug( - "Created %s blocks for src engine %s and rank %s", + "Created %s blocks for src engine %s and rank %s on device id %s", len(blocks_data), self.engine_id, self.tp_rank, + self.device_id, ) descs = self.nixl_wrapper.get_xfer_descs(blocks_data, self.nixl_memory_type) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index bdc124b0571c0..e74519b21aa6e 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -134,9 +134,18 @@ class CoreEngineProcManager: data_parallel = vllm_config.parallel_config.data_parallel_size > 1 try: for proc, local_dp_rank in zip(self.processes, local_dp_ranks): + # Adjust device control in DP for non-CUDA platforms + # as well as external and ray launchers + # For CUDA platforms, we use torch.cuda.set_device() with ( set_device_control_env_var(vllm_config, local_dp_rank) - if (data_parallel) + if ( + data_parallel + and ( + not current_platform.is_cuda_alike() + or vllm_config.parallel_config.use_ray + ) + ) else contextlib.nullcontext() ): proc.start() diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 2b2a69f4af3ab..464fbf11a21ad 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -8,7 +8,6 @@ import torch.distributed as dist from vllm.config import ParallelConfig from vllm.distributed.parallel_state import get_dp_group from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.v1.worker.ubatch_utils import ( UBatchSlices, check_ubatch_thresholds, @@ -20,7 +19,8 @@ logger = init_logger(__name__) def _get_device_and_group(parallel_config: ParallelConfig): - device = current_platform.device_type + # Use the actual device assigned to the DP group, not just the device type + device = get_dp_group().device group = get_dp_group().device_group # Transfering this tensor from GPU to CPU will introduce a GPU sync diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 29b6532e4366f..54c5f81fc7e8e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -172,6 +172,29 @@ class Worker(WorkerBase): if self.device_config.device.type == "cuda": # This env var set by Ray causes 
exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + if ( + self.parallel_config.data_parallel_size > 1 + and self.parallel_config.data_parallel_size_local > 0 + and self.parallel_config.distributed_executor_backend + not in ["ray", "external_launcher"] + and self.vllm_config.parallel_config.data_parallel_backend != "ray" + ): + # Use local DP rank if available, otherwise use global DP rank. + dp_local_rank = self.parallel_config.data_parallel_rank_local + if dp_local_rank is None: + dp_local_rank = self.parallel_config.data_parallel_rank + + tp_pp_world_size = ( + self.parallel_config.pipeline_parallel_size + * self.parallel_config.tensor_parallel_size + ) + + # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK + self.local_rank += dp_local_rank * tp_pp_world_size + assert self.local_rank < torch.cuda.device_count(), ( + f"DP adjusted local rank {self.local_rank} is out of bounds. " + ) + self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) From 33a0ea5f3264b5b2f571b8a53357e10efcc94670 Mon Sep 17 00:00:00 2001 From: Kebe Date: Fri, 31 Oct 2025 01:33:13 +0900 Subject: [PATCH 027/976] [Docs] add Shanghai Meetup - 2025/10 (#27545) Signed-off-by: Kebe Signed-off-by: esmeetu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: esmeetu --- README.md | 1 + docs/community/meetups.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 3dcdd7dc00942..2e750ef8fc894 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio *Latest News* 🔥 +- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing). - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index e821e2ac81149..0dfc582c7f8a7 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -2,6 +2,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg), October 25th 2025. [[Slides]](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6) - [vLLM Toronto Meetup](https://luma.com/e80e0ymm), September 25th 2025. 
[[Slides]](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing) - [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA) - [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing) From ba33e8830dceb32e9b03508bbff435e3082759b8 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 30 Oct 2025 10:22:30 -0700 Subject: [PATCH 028/976] Reapply "Install pre-built xformers-0.0.32.post2 built with pt-2.9.0" (#27768) Signed-off-by: Huy Do --- docker/Dockerfile | 7 ------- requirements/cuda.txt | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 42a830cb605ad..61ebf970fe960 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -361,13 +361,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist && uv pip install --system dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# TODO (huydhn): Remove this once xformers is released for 2.9.0 -RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' - . /etc/environment - export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' - uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2" -BASH - # Install FlashInfer pre-compiled kernel cache and binaries # https://docs.flashinfer.ai/installation.html RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cuda.txt b/requirements/cuda.txt index dd45eb832a96a..5f7d520cd3662 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -9,7 +9,7 @@ torch==2.9.0 torchaudio==2.9.0 # These must be updated alongside torch torchvision==0.24.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 -# xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8 +# Build from https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 +xformers==0.0.33+5d4b92a5.d20251029; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9 # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.4.1 From 10042057953cd1528701234925de3d7b109e26de Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 31 Oct 2025 01:27:39 +0800 Subject: [PATCH 029/976] [MTP] Refactor mtp predictor to avoid d2h operation (#27643) Signed-off-by: MengqingCao --- vllm/model_executor/models/deepseek_mtp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index aa176ef05fccb..3984d23970ac5 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -97,7 +97,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module): ) -> torch.Tensor: assert inputs_embeds is not None # masking inputs at position 0, as not needed by MTP - inputs_embeds[positions == 0] = 0 + inputs_embeds = torch.where(positions.unsqueeze(-1) == 0, 0, inputs_embeds) inputs_embeds = self.enorm(inputs_embeds) previous_hidden_states = self.hnorm(previous_hidden_states) From 2918c1b49c88c29783c86f78d2c4221cb9622379 Mon Sep 17 00:00:00 2001 From: Roger Meier Date: Fri, 31 Oct 2025 01:36:56 +0800 Subject: [PATCH 030/976] [Model] Use the same fused_moe configs for all H200 devices (#23642) Signed-off-by: Roger Meier --- vllm/model_executor/layers/fused_moe/fused_moe.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 5f9bfd6d9cf7d..d0f5eb498127b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -818,6 +818,9 @@ def get_config_file_name( E: int, N: int, dtype: str | None, block_shape: list[int] | None = None ) -> str: device_name = current_platform.get_device_name().replace(" ", "_") + # Set device_name to H200 if a device from the H200 family is detected + if "H200" in device_name: + device_name = "H200" dtype_selector = "" if not dtype else f",dtype={dtype}" block_shape_selector = ( "" if not block_shape or not all(block_shape) else f",block_shape={block_shape}" From ab98f6556ff84508cdcdcd6a6b1e612a7a8819d0 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 30 Oct 2025 14:52:18 -0400 Subject: [PATCH 031/976] [Bugfix] Fix 2 precommit issues - (mamba_block_size, kv_cache_config) (#27811) Signed-off-by: Tyler Michael Smith Signed-off-by: Tyler Michael Smith Co-authored-by: Nick Hill --- vllm/model_executor/models/config.py | 2 +- vllm/v1/core/sched/scheduler.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index b0a48a9f1d458..7150977e9266b 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -406,7 +406,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # easily by changing the way we layout chunks in the # mamba2 kernels. 
- base_chunk_size = model_config.get_mamba_chunk_size() + base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size() attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token) chunk_size = lcm(base_chunk_size, kernel_block_alignment_size) attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ad6fbee2ec083..98c8f08b0aae8 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -13,7 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, - supports_hma, + SupportsHMA, ) from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.logger import init_logger @@ -93,7 +93,11 @@ class Scheduler(SchedulerInterface): ) connector_vllm_config = copy.copy(self.vllm_config) - connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) + + # We're dynamically inserting a kv_cache_config variable into the + # connector_vllm_config. This is distinct from the cache_config + # that is already in there. + connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) # type: ignore[attr-defined] self.connector = KVConnectorFactory.create_connector( config=connector_vllm_config, role=KVConnectorRole.SCHEDULER ) @@ -1327,15 +1331,15 @@ class Scheduler(SchedulerInterface): block_ids = self.kv_cache_manager.get_block_ids(request.request_id) - if not supports_hma(self.connector): + if not isinstance(self.connector, SupportsHMA): # NOTE(Kuntai): We should deprecate this code path after we enforce # all connectors to support HMA. # Hybrid memory allocator should be already turned off for this # code path, but let's double-check here. assert len(self.kv_cache_config.kv_cache_groups) == 1 return self.connector.request_finished(request, block_ids[0]) - else: - return self.connector.request_finished(request, block_ids) + + return self.connector.request_finished_all_groups(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ From 4574d48bab9c4e38b7c0a830eeefc8f0980e8c58 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Thu, 30 Oct 2025 11:52:36 -0700 Subject: [PATCH 032/976] [Core][Bookkeeping] Update cu_num_accepted_tokens for all req_index (#27629) Signed-off-by: Jialin Ouyang --- vllm/v1/worker/gpu_model_runner.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 729ce462cf186..04814b5991ebc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2323,11 +2323,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None else: sampled_ids = valid_sampled_token_ids[req_idx] + + num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 + + if cu_num_accepted_tokens is not None: + cu_num_accepted_tokens.append( + cu_num_accepted_tokens[-1] + num_sampled_ids + ) + if not sampled_ids: continue start_idx = self.input_batch.num_tokens_no_spec[req_idx] - end_idx = start_idx + len(sampled_ids) + end_idx = start_idx + num_sampled_ids assert end_idx <= self.max_model_len, ( "Sampled token IDs exceed the max model length. 
" f"Total number of tokens: {end_idx} > max_model_len: " @@ -2343,11 +2351,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_state = self.requests[req_id] req_state.output_token_ids.extend(sampled_ids) - if cu_num_accepted_tokens is not None: - cu_num_accepted_tokens.append( - cu_num_accepted_tokens[-1] + len(sampled_ids) - ) - logprobs_lists = ( logprobs_tensors.tolists(cu_num_accepted_tokens) if not self.use_async_scheduling and logprobs_tensors is not None From a2981c42720a34b5abf59c4c14df701f8105d4cd Mon Sep 17 00:00:00 2001 From: cong-meta Date: Thu, 30 Oct 2025 12:10:16 -0700 Subject: [PATCH 033/976] [EP/DP][API Server] Enable DP-aware routing in OpenAI API requests (#24945) Co-authored-by: Cong Chen --- tests/entrypoints/openai/test_serving_chat.py | 76 +++++++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 4 + vllm/entrypoints/openai/serving_completion.py | 4 + vllm/entrypoints/openai/serving_engine.py | 15 ++++ 4 files changed, 99 insertions(+) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index d1367b4eeaf62..1b83ed7e31e78 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -651,3 +651,79 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): await serving_chat.create_chat_completion(req) engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1] assert engine_prompt.get("cache_salt") == "test_salt" + + +@pytest.mark.asyncio +async def test_serving_chat_data_parallel_rank_extraction(): + """Test that data_parallel_rank is properly extracted from header and + passed to engine.""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.processor = MagicMock() + mock_engine.io_processor = MagicMock() + + # Mock the generate method to return an async generator + async def mock_generate(*args, **kwargs): + # Yield a fake RequestOutput + from vllm.outputs import CompletionOutput, RequestOutput + + yield RequestOutput( + request_id="test-request", + prompt="test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text="test response", + token_ids=[4, 5, 6], + cumulative_logprob=0.0, + logprobs=None, + finish_reason="stop", + stop_reason=None, + ) + ], + finished=True, + ) + + mock_engine.generate = AsyncMock(side_effect=mock_generate) + + serving_chat = _build_serving_chat(mock_engine) + + # Test when data_parallel_rank is present in header + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "what is 1+1?"}], + ) + + # Mock request with X-data-parallel-rank header + mock_raw_request = MagicMock() + mock_raw_request.headers = {"X-data-parallel-rank": "2"} + mock_raw_request.state = MagicMock() + + with suppress(Exception): + await serving_chat.create_chat_completion(req, mock_raw_request) + + # Verify that data_parallel_rank was passed to engine.generate + assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs + assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] == 2 + + # Test when data_parallel_rank is not present (defaults to None) + req_no_dp = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "what is 2+2?"}], + ) + + # Mock request with no header + mock_raw_request_no_dp = MagicMock() + 
mock_raw_request_no_dp.headers = {} + mock_raw_request_no_dp.state = MagicMock() + + with suppress(Exception): + await serving_chat.create_chat_completion(req_no_dp, mock_raw_request_no_dp) + + # Verify that data_parallel_rank defaults to None + assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs + assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] is None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 934ff78b2c710..bb770ecf03383 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -264,6 +264,9 @@ class OpenAIServingChat(OpenAIServing): if raw_request: raw_request.state.request_metadata = request_metadata + # Extract data_parallel_rank from header (router can inject it) + data_parallel_rank = self._get_data_parallel_rank(raw_request) + # Schedule the request and get the result generator. generators: list[AsyncGenerator[RequestOutput, None]] = [] try: @@ -331,6 +334,7 @@ class OpenAIServingChat(OpenAIServing): priority=request.priority, prompt_text=prompt_text, tokenization_kwargs=tokenization_kwargs, + data_parallel_rank=data_parallel_rank, ) generators.append(generator) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 62bc932f8b844..14dbdd4cb4c7c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -141,6 +141,9 @@ class OpenAIServingCompletion(OpenAIServing): logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) + # Extract data_parallel_rank from header (router can inject it) + data_parallel_rank = self._get_data_parallel_rank(raw_request) + # Schedule the request and get the result generator. 
generators: list[AsyncGenerator[RequestOutput, None]] = [] try: @@ -224,6 +227,7 @@ class OpenAIServingCompletion(OpenAIServing): priority=request.priority, prompt_text=prompt_text, tokenization_kwargs=tokenization_kwargs, + data_parallel_rank=data_parallel_rank, ) generators.append(generator) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index af5a423134fb0..c0750cd641667 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1298,6 +1298,21 @@ class OpenAIServing: return raw_request.headers.get("X-Request-Id", default) + @staticmethod + def _get_data_parallel_rank(raw_request: Request | None) -> int | None: + """Pulls the data parallel rank from a header, if provided""" + if raw_request is None: + return None + + rank_str = raw_request.headers.get("X-data-parallel-rank") + if rank_str is None: + return None + + try: + return int(rank_str) + except ValueError: + return None + @staticmethod def _get_decoded_token( logprob: Logprob, From 4917002523db90813a47ca5aed5cd22e2edb75f4 Mon Sep 17 00:00:00 2001 From: Sumanth R Hegde <39546518+SumanthRH@users.noreply.github.com> Date: Thu, 30 Oct 2025 12:26:27 -0700 Subject: [PATCH 034/976] [Fix] Skip `record_sleep_state` logic in `PrometheusStatsLogger` if not in dev mode (#27789) Signed-off-by: SumanthRH --- tests/basic_correctness/test_cumem.py | 43 ++++++++++++++++++++++++++- vllm/v1/metrics/loggers.py | 3 ++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 09f4ec03fbbb0..0c037622f5e82 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio + import pytest import torch -from vllm import LLM, SamplingParams +from vllm import LLM, AsyncEngineArgs, AsyncLLMEngine, SamplingParams from vllm.device_allocator.cumem import CuMemAllocator from vllm.utils.mem_constants import GiB_bytes @@ -201,3 +203,42 @@ def test_deep_sleep(): # cmp output assert output[0].outputs[0].text == output2[0].outputs[0].text + + +@create_new_process_for_each_test() +def test_deep_sleep_async(): + async def test(): + model = "hmellor/tiny-random-LlamaForCausalLM" + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + engine_args = AsyncEngineArgs( + model=model, + enable_sleep_mode=True, + ) + + llm = AsyncLLMEngine.from_engine_args(engine_args) + prompt = "How are you?" 
+ sampling_params = SamplingParams(temperature=0, max_tokens=10) + outputs = llm.generate(prompt, sampling_params, request_id="test_request_id1") + async for output in outputs: + pass + + # Put the engine to deep sleep + await llm.sleep(level=2) + + await llm.wake_up(tags=["weights"]) + await llm.collective_rpc("reload_weights") + free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline + assert used_bytes < 4 * GiB_bytes + + # now allocate kv cache and cuda graph memory + await llm.wake_up(tags=["kv_cache"]) + outputs2 = llm.generate(prompt, sampling_params, request_id="test_request_id2") + async for output2 in outputs2: + pass + + # cmp output + assert output.outputs[0].text == output2.outputs[0].text + + asyncio.run(test()) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 3772f07066a12..67b6ceaa847f6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1052,6 +1052,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time() def record_sleep_state(self, sleep: int = 0, level: int = 0): + if not envs.VLLM_SERVER_DEV_MODE: + return + awake = 1 discard_all = 0 weights_offloaded = 0 From a8141fa649d1296488cc5de2b479fed460bb34f4 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 30 Oct 2025 15:32:39 -0400 Subject: [PATCH 035/976] [Refactor] Remove `VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK` (#27750) Signed-off-by: yewentao256 --- vllm/distributed/device_communicators/all2all.py | 2 +- vllm/envs.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 013ef3c1f5c36..c40dde26b741f 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -363,7 +363,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): num_rdma_bytes=num_rdma_bytes, low_latency_mode=True, num_qps_per_rank=num_qps_per_rank, - allow_nvlink_for_low_latency_mode=envs.VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK, + allow_nvlink_for_low_latency_mode=True, allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL, ) diff --git a/vllm/envs.py b/vllm/envs.py index 0548f01fc8cdf..2744335ed3d38 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -207,7 +207,6 @@ if TYPE_CHECKING: VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024 VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False - VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK: bool = True VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False VLLM_DBO_COMM_SMS: int = 20 VLLM_PATTERN_MATCH_DEBUG: str | None = None @@ -1400,11 +1399,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool( int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0")) ), - # Allow DeepEP to use nvlink for internode_ll kernel, turn this on for - # better latency on GB200 like system - "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK": lambda: bool( - int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "1")) - ), # Allow DeepEP to use MNNVL (multi-node nvlink) for internode_ll kernel, # turn this for better latency on GB200 like system "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool( @@ -1566,7 +1560,6 @@ def compute_hash() -> str: "VLLM_NVFP4_GEMM_BACKEND", "VLLM_USE_FBGEMM", 
"VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", - "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", ] for key in environment_variables_to_hash: From 4b68c4a55b0fa5846d180532ae7e58db85101e07 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Thu, 30 Oct 2025 12:47:30 -0700 Subject: [PATCH 036/976] [Core][Perf] Only invoke save_new_computed_blocks when computed blocks are not empty (#27799) Signed-off-by: Jialin Ouyang --- vllm/v1/core/kv_cache_manager.py | 11 ++++++----- vllm/v1/core/single_type_kv_cache_manager.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bb8cec91f36dd..63a1ff06e4049 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -306,11 +306,12 @@ class KVCacheManager: "Computed blocks should be empty when prefix caching is disabled" ) - # Append the new computed blocks to the request blocks until now to - # avoid the case where the new blocks cannot be allocated. - self.coordinator.save_new_computed_blocks( - request.request_id, new_computed_block_list - ) + if new_computed_block_list is not self.empty_kv_cache_blocks.blocks: + # Append the new computed blocks to the request blocks until now to + # avoid the case where the new blocks cannot be allocated. + self.coordinator.save_new_computed_blocks( + request.request_id, new_computed_block_list + ) new_blocks = self.coordinator.allocate_new_blocks( request.request_id, num_tokens_need_slot, num_encoder_tokens diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 575ae3d7d83b6..8f14fb1894707 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -151,7 +151,7 @@ class SingleTypeKVCacheManager(ABC): num_tokens: The total number of tokens that need to be cached (including tokens that are already cached). """ - num_cached_blocks = self.num_cached_block[request.request_id] + num_cached_blocks = self.num_cached_block.get(request.request_id, 0) num_full_blocks = num_tokens // self.block_size if num_cached_blocks >= num_full_blocks: From e7acb200766a0f8f006f9b6fd961dfdceabd7269 Mon Sep 17 00:00:00 2001 From: Paul Zhang Date: Thu, 30 Oct 2025 16:11:29 -0400 Subject: [PATCH 037/976] [Feature] Batch invariant torch.compile (#27660) Signed-off-by: PaulZhang12 Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/config/model.py | 7 -- vllm/envs.py | 8 ++- vllm/model_executor/layers/batch_invariant.py | 71 +++++++++++++++++++ .../model_executor/layers/quantization/fp8.py | 5 +- 4 files changed, 82 insertions(+), 9 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 092c67e7bed8c..082f90653f5af 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -20,9 +20,6 @@ from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType from vllm.config.utils import assert_hashable, config, getattr_iter from vllm.logger import init_logger -from vllm.model_executor.layers.batch_invariant import ( - vllm_is_batch_invariant, -) from vllm.platforms import current_platform from vllm.transformers_utils.config import ( ConfigFormat, @@ -436,10 +433,6 @@ class ModelConfig: skip_mm_profiling: bool | None, video_pruning_rate: float | None, ) -> None: - # Enable batch invariance settings if requested - if vllm_is_batch_invariant(): - self.enforce_eager = True - # Set the default seed to 0 in V1. 
# NOTE(woosuk): In V1, we use separate processes for workers (unless # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here diff --git a/vllm/envs.py b/vllm/envs.py index 2744335ed3d38..21237c70a45e4 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -251,6 +251,9 @@ def disable_compile_cache() -> bool: def use_aot_compile() -> bool: + from vllm.model_executor.layers.batch_invariant import ( + vllm_is_batch_invariant, + ) from vllm.utils.torch_utils import is_torch_equal_or_newer default_value = ( @@ -259,7 +262,10 @@ def use_aot_compile() -> bool: else "0" ) - return os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1" + return ( + not vllm_is_batch_invariant() + and os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1" + ) def env_with_choices( diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 5706786bccb1d..39e77b935d3d5 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -11,6 +11,7 @@ import torch import vllm.envs as envs from vllm.logger import init_logger from vllm.triton_utils import tl, triton +from vllm.utils.torch_utils import is_torch_equal_or_newer logger = init_logger(__name__) @@ -716,6 +717,10 @@ def linear_batch_invariant(input, weight, bias=None): _batch_invariant_MODE = False _batch_invariant_LIB = None _original_torch_bmm = None +_original_fp16_reduction_precision = None +_original_bf16_reduction_precision = None +_original_cublas_workspace_cfg = None +_original_cublaslt_workspace_size = None def is_batch_invariant_mode_enabled(): @@ -724,6 +729,8 @@ def is_batch_invariant_mode_enabled(): def enable_batch_invariant_mode(): global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm + global _original_fp16_reduction_precision, _original_bf16_reduction_precision + global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size if _batch_invariant_MODE: return @@ -745,14 +752,75 @@ def enable_batch_invariant_mode(): _original_torch_bmm = torch.bmm torch.bmm = bmm_batch_invariant + _original_bf16_reduction_precision = ( + torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction + ) + _original_fp16_reduction_precision = ( + torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction + ) + + reduced_precision_val = ( + (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False + ) + torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( + reduced_precision_val + ) + torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = ( + reduced_precision_val + ) + torch.backends.cuda.preferred_blas_library(backend="cublaslt") + + if not is_torch_equal_or_newer("2.10.0.dev"): + _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None) + _original_cublaslt_workspace_size = os.environ.get( + "CUBLASLT_WORKSPACE_SIZE", None + ) + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1" + def disable_batch_invariant_mode(): global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm + global _original_fp16_reduction_precision, _original_bf16_reduction_precision + global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size + if not _batch_invariant_MODE: + return + if _batch_invariant_LIB is not None: _batch_invariant_LIB._destroy() if _original_torch_bmm is not None: torch.bmm = _original_torch_bmm _original_torch_bmm = None + + if _original_bf16_reduction_precision is not None: + 
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = ( + _original_bf16_reduction_precision + ) + _original_bf16_reduction_precision = None + if _original_fp16_reduction_precision is not None: + torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( + _original_fp16_reduction_precision + ) + _original_fp16_reduction_precision = None + + torch.backends.cuda.preferred_blas_library(backend="default") + + if not is_torch_equal_or_newer("2.10.0.dev"): + # Set cublas env vars to previous results. If previous results are None, + # that means the env vars were not set, so we should remove them. + if _original_cublas_workspace_cfg: + os.environ["CUBLAS_WORKSPACE_CONFIG"] = _original_cublas_workspace_cfg + elif "CUBLAS_WORKSPACE_CONFIG" in os.environ: + del os.environ["CUBLAS_WORKSPACE_CONFIG"] + + if _original_cublaslt_workspace_size: + os.environ["CUBLASLT_WORKSPACE_SIZE"] = _original_cublaslt_workspace_size + elif "CUBLASLT_WORKSPACE_SIZE" in os.environ: + del os.environ["CUBLASLT_WORKSPACE_SIZE"] + + _original_cublas_workspace_cfg = None + _original_cublaslt_workspace_size = None + _batch_invariant_MODE = False _batch_invariant_LIB = None @@ -831,6 +899,9 @@ def override_envs_for_invariance(): os.environ["NCCL_NTHREADS"] = "1" os.environ["NCCL_SOCKET_NTHREADS"] = "1" + # torch.compile settings + os.environ["VLLM_USE_AOT_COMPILE"] = "0" + def init_batch_invariance(): # this will hit all the csrc overrides as well diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e5681cb856258..f82eccb88ce09 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -363,6 +363,7 @@ class Fp8LinearMethod(LinearMethodBase): self.use_marlin = False self.use_aiter_and_is_supported = check_aiter_fp8_linear_support() + self.use_deep_gemm = is_deep_gemm_supported() self.weight_block_size = self.quant_config.weight_block_size self.block_quant = self.weight_block_size is not None @@ -545,8 +546,10 @@ class Fp8LinearMethod(LinearMethodBase): # if batch invariant mode is enabled, prefer DeepGEMM FP8 path # we will use BF16 dequant when DeepGEMM is not supported. 
if vllm_is_batch_invariant(): + # Call is_deep_gemm_supported() ahead of time for torch.compile + # dynamo has trouble tracing through if self.block_quant and should_use_deepgemm_for_fp8_linear( - torch.bfloat16, layer.weight, None + torch.bfloat16, layer.weight, self.use_deep_gemm ): # use group quant consistent with block size across K assert self.act_q_group_shape is not None From c9791f18138d1a11bfe68550b10673b493ec9330 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 30 Oct 2025 16:26:13 -0700 Subject: [PATCH 038/976] [BugFix] Fix broken import in initialize_ray_cluster() (#27838) Signed-off-by: Nick Hill --- vllm/v1/executor/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index 382f008266e62..9385e55b066f8 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -322,7 +322,7 @@ def initialize_ray_cluster( # Prevalidate GPU requirements before Ray processing if current_platform.is_cuda() and parallel_config.world_size > 1: - from vllm.utils import cuda_device_count_stateless + from vllm.utils.torch_utils import cuda_device_count_stateless available_gpus = cuda_device_count_stateless() if parallel_config.world_size > available_gpus: From d5d2a0fe7480fa23348ec253cb5c80901d27f952 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 30 Oct 2025 19:46:02 -0400 Subject: [PATCH 039/976] [Misc] Make all tool scripts executable (#27831) Signed-off-by: Matthew Bonanni --- tools/check_repo.sh | 0 tools/ep_kernels/configure_system_drivers.sh | 0 tools/ep_kernels/elastic_ep/install_eep_libraries.sh | 0 tools/ep_kernels/install_python_libraries.sh | 1 + tools/flashinfer-build.sh | 0 tools/vllm-tpu/build.sh | 0 6 files changed, 1 insertion(+) mode change 100644 => 100755 tools/check_repo.sh mode change 100644 => 100755 tools/ep_kernels/configure_system_drivers.sh mode change 100644 => 100755 tools/ep_kernels/elastic_ep/install_eep_libraries.sh mode change 100644 => 100755 tools/ep_kernels/install_python_libraries.sh mode change 100644 => 100755 tools/flashinfer-build.sh mode change 100644 => 100755 tools/vllm-tpu/build.sh diff --git a/tools/check_repo.sh b/tools/check_repo.sh old mode 100644 new mode 100755 diff --git a/tools/ep_kernels/configure_system_drivers.sh b/tools/ep_kernels/configure_system_drivers.sh old mode 100644 new mode 100755 diff --git a/tools/ep_kernels/elastic_ep/install_eep_libraries.sh b/tools/ep_kernels/elastic_ep/install_eep_libraries.sh old mode 100644 new mode 100755 diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh old mode 100644 new mode 100755 index c2d8d1ed9e3d5..5ea543f4cb1e8 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash set -ex # prepare workspace directory diff --git a/tools/flashinfer-build.sh b/tools/flashinfer-build.sh old mode 100644 new mode 100755 diff --git a/tools/vllm-tpu/build.sh b/tools/vllm-tpu/build.sh old mode 100644 new mode 100755 From 697f507a8ebb13d74e8c0695aec05d9baefb45a0 Mon Sep 17 00:00:00 2001 From: Jakub Sochacki <97886316+jakub-sochacki@users.noreply.github.com> Date: Fri, 31 Oct 2025 00:57:22 +0100 Subject: [PATCH 040/976] [CI/Build][Intel] Enable performance benchmarks for Intel Gaudi 3 (#26919) Signed-off-by: jakub-sochacki --- .buildkite/performance-benchmarks/README.md | 3 +- .../performance-benchmarks-descriptions.md | 6 +- .../scripts/run-performance-benchmarks.sh | 
13 +++ .../tests/latency-tests-hpu.json | 55 +++++++++++++ .../tests/serving-tests-hpu.json | 82 +++++++++++++++++++ .../tests/throughput-tests-hpu.json | 61 ++++++++++++++ 6 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 .buildkite/performance-benchmarks/tests/latency-tests-hpu.json create mode 100644 .buildkite/performance-benchmarks/tests/serving-tests-hpu.json create mode 100644 .buildkite/performance-benchmarks/tests/throughput-tests-hpu.json diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 332142ba5d170..6d494f64f14fa 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models. **Benchmarking Duration**: about 1hr. @@ -34,6 +34,7 @@ Runtime environment variables: See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. +For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. > ### Latency test diff --git a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md index 8bb16bd3cf373..b9437ac5ca99a 100644 --- a/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/performance-benchmarks/performance-benchmarks-descriptions.md @@ -5,7 +5,7 @@ - Input length: 32 tokens. - Output length: 128 tokens. - Batch size: fixed (8). -- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - CPU Models: llama-3.1 8B. - Evaluation metrics: end-to-end latency (mean, median, p99). @@ -16,7 +16,7 @@ - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm to achieve maximum throughput. -- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - CPU Models: llama-3.1 8B. - Evaluation metrics: throughput. @@ -28,7 +28,7 @@ - Output length: the corresponding output length of these 200 prompts. - Batch size: dynamically determined by vllm and the arrival pattern of the requests. - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. +- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2 - CPU Models: llama-3.1 8B. 
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index 9447ceffd7e22..99a5a5e334f8e 100644 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -15,6 +15,8 @@ check_gpus() { declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) elif command -v amd-smi; then declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) + elif command -v hl-smi; then + declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l) fi if [[ $gpu_count -gt 0 ]]; then @@ -23,10 +25,16 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi + + declare -g arch_suffix='' + if command -v nvidia-smi; then declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') elif command -v amd-smi; then declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') + elif command -v hl-smi; then + declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//') + arch_suffix='-hpu' fi echo "GPU type is $gpu_type" } @@ -138,6 +146,10 @@ kill_gpu_processes() { while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do sleep 1 done + elif command -v hl-smi; then + while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do + sleep 1 + done fi # remove vllm config file @@ -451,6 +463,7 @@ main() { ARCH='-cpu' else check_gpus + ARCH="$arch_suffix" fi check_hf_token diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json new file mode 100644 index 0000000000000..296380f72a668 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-hpu.json @@ -0,0 +1,55 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + }, + { + "test_name": "latency_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 256, + "async-scheduling": "" + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json new file mode 100644 index 0000000000000..8c6b34bd9fa33 --- /dev/null +++ 
b/.buildkite/performance-benchmarks/tests/serving-tests-hpu.json @@ -0,0 +1,82 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 256, + "async-scheduling": "" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json new file mode 100644 index 0000000000000..3127bf2f6bce3 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json @@ -0,0 +1,61 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + 
"environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 512, + "async-scheduling": "" + } + } +] From 2bf0bcc1fca422222b78a3b1f39845ecd037aecc Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 30 Oct 2025 20:29:26 -0400 Subject: [PATCH 041/976] [CI Test] Add Scheduled Integration Test (#27765) Signed-off-by: yewentao256 --- .../deepseek_v2_lite_ep_eplb.sh | 62 +++++++++++++++++++ .../qwen30b_a3b_fp8_block_ep.sh | 61 ++++++++++++++++++ .buildkite/test-pipeline.yaml | 18 ++++++ 3 files changed, 141 insertions(+) create mode 100644 .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh create mode 100644 .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh new file mode 100644 index 0000000000000..5302f524a0ae4 --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] +THRESHOLD=${1:-0.25} +NUM_Q=${2:-1319} +PORT=${3:-8010} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="deepseek-ai/DeepSeek-V2-lite" +BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + VLLM_ALL2ALL_BACKEND=$BACK \ + vllm serve "$MODEL" \ + --enforce-eager \ + --tensor-parallel-size 2 \ + --data-parallel-size 2 \ + --enable-expert-parallel \ + --enable-eplb \ + --trust-remote-code \ + --max-model-len 2048 \ + --port $PORT & + SERVER_PID=$! 
+ wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh new file mode 100644 index 0000000000000..a5135299297e2 --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] +THRESHOLD=${1:-0.8} +NUM_Q=${2:-1319} +PORT=${3:-8020} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="QWen/Qwen3-30B-A3B-FP8" +BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + VLLM_ALL2ALL_BACKEND=$BACK \ + vllm serve "$MODEL" \ + --enforce-eager \ + --tensor-parallel-size 2 \ + --data-parallel-size 2 \ + --enable-expert-parallel \ + --trust-remote-code \ + --max-model-len 2048 \ + --port $PORT & + SERVER_PID=$! + wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 339e3aab6c031..8d4e5ece94d19 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1234,3 +1234,21 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 From b2e65cb4a7ea7c000517a7b78a6e0ccd9ecb0517 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Thu, 30 Oct 2025 19:40:35 -0500 Subject: [PATCH 042/976] [benchmark] Make request IDs unique across clients by default (#27723) Signed-off-by: Seiji Eicher --- vllm/benchmarks/serve.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 71d136d61ceaf..4b15d8e62913c 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -26,6 +26,7 @@ import os import random import shutil 
import time +import uuid import warnings from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass @@ -1160,7 +1161,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--request-id-prefix", type=str, required=False, - default="benchmark-serving", + default=f"bench-{uuid.uuid4().hex[:8]}-", help="Specify the prefix of request id.", ) From 36960501d336a15cf0de7569e2662793ad9a4f3f Mon Sep 17 00:00:00 2001 From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Date: Fri, 31 Oct 2025 13:15:26 +0530 Subject: [PATCH 043/976] [Hardware][Powerpc] Fix VLLM_CPU_OMP_THREADS_BIND="auto" low CPU utilization for Power (#27734) Signed-off-by: Akash Kaothalkar Co-authored-by: Akash Kaothalkar --- vllm/platforms/cpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 8c1d46564f6f6..4b9f4aef022d0 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -316,7 +316,8 @@ class CpuPlatform(Platform): if ( platform.system() == "Linux" - and Platform.get_cpu_architecture() == CpuArchEnum.ARM + and Platform.get_cpu_architecture() + in (CpuArchEnum.ARM, CpuArchEnum.POWERPC) and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str) ): # We need to LD_PRELOAD PyTorch's libgomp, otherwise only From e5ef4dfc11abfc44494963b85ced1c79d1d5efea Mon Sep 17 00:00:00 2001 From: toncao <130689535+toncao@users.noreply.github.com> Date: Fri, 31 Oct 2025 16:36:37 +0700 Subject: [PATCH 044/976] [Kimi-Linear] Correct prefixes and add compatibility to AWQ quants (#27834) Signed-off-by: toncao Co-authored-by: toncao --- vllm/model_executor/models/kimi_linear.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index a60a8d764d9d1..f8df72b067dd0 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -155,6 +155,7 @@ class KimiMoE(nn.Module): hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=False, + prefix=f"{prefix}.shared_experts", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -340,7 +341,7 @@ class KimiDecoderLayer(nn.Module): self.block_sparse_moe = KimiMoE( config=config, quant_config=quant_config, - prefix=f"{prefix}.mlp", + prefix=f"{prefix}.block_sparse_moe", ) self.mlp = self.block_sparse_moe else: From 3933f18a5e7b69b096d4b8f700dfa496e6716d86 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 31 Oct 2025 19:33:12 +0800 Subject: [PATCH 045/976] [Bugfix] Avoid too small block m/n for FlexAttention kernel option (#27853) Signed-off-by: Isotr0py --- vllm/v1/attention/backends/flex_attention.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index c16a77c093cfb..928252636d583 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -896,6 +896,8 @@ def get_kernel_options( return kernel_options else: preferred_block = 32 if query.dtype == torch.float32 else 64 + block_lower_bound = 16 + block_m_candidate = ensure_divisible(preferred_block, block_m) block_n_candidate = ensure_divisible(preferred_block, block_n) @@ -910,6 +912,9 @@ def get_kernel_options( max(1, block_n_candidate // 2), block_n ) + block_m_candidate = max(block_m_candidate, block_lower_bound) + block_n_candidate = max(block_n_candidate, block_lower_bound) + 
kernel_options["BLOCK_M"] = block_m_candidate kernel_options["BLOCK_N"] = block_n_candidate From 933cdea44061cb19a99421d2d2e51535e7f21216 Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Fri, 31 Oct 2025 04:36:18 -0700 Subject: [PATCH 046/976] =?UTF-8?q?[BugFix]=20Don=E2=80=99t=20compute=20re?= =?UTF-8?q?order=20threshold=20when=20there=20are=20no=20attention=20group?= =?UTF-8?q?s=20(#27861)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm/v1/worker/gpu_model_runner.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 04814b5991ebc..747a7b377e401 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4149,6 +4149,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): group.get_metadata_builder().reorder_batch_threshold for group in self._attn_group_iterator() ] + # If there are no attention groups (attention-free model) or no backend + # reports a threshold, leave reordering disabled. + if len(reorder_batch_thresholds) == 0: + self.reorder_batch_threshold = None + return self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) def _find_compatible_block_sizes( From 3857eb87257cf23d2401a651884dcbbf42c16b7b Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Fri, 31 Oct 2025 21:35:52 +0800 Subject: [PATCH 047/976] [Perf] Decouple torch op from GDA to leverage torch.compile (#27871) Signed-off-by: zjy0516 --- vllm/model_executor/layers/kda.py | 116 +++++++++++++++++------------- 1 file changed, 68 insertions(+), 48 deletions(-) diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index c45e7546fac1e..308bc8be1dece 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -40,18 +40,36 @@ logger = init_logger(__name__) def kda_attention( - hidden_states: torch.Tensor, - output: torch.Tensor, + q_proj_states: torch.Tensor, + k_proj_states: torch.Tensor, + v_proj_states: torch.Tensor, + g1: torch.Tensor, + g2: torch.Tensor, + beta: torch.Tensor, + core_attn_out: torch.Tensor, layer_name: str, ) -> None: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - self._forward(hidden_states=hidden_states, output=output) + self._forward( + q_proj_states=q_proj_states, + k_proj_states=k_proj_states, + v_proj_states=v_proj_states, + g1=g1, + g2=g2, + beta=beta, + core_attn_out=core_attn_out, + ) def kda_attention_fake( - hidden_states: torch.Tensor, - output: torch.Tensor, + q_proj_states: torch.Tensor, + k_proj_states: torch.Tensor, + v_proj_states: torch.Tensor, + g1: torch.Tensor, + g2: torch.Tensor, + beta: torch.Tensor, + core_attn_out: torch.Tensor, layer_name: str, ) -> None: return @@ -60,7 +78,7 @@ def kda_attention_fake( direct_register_custom_op( op_name="kda_attention", op_func=kda_attention, - mutates_args=["output"], + mutates_args=["core_attn_out"], fake_impl=kda_attention_fake, ) @@ -241,37 +259,56 @@ class KimiDeltaAttention(nn.Module, MambaBase): hidden_states: torch.Tensor, positions: torch.Tensor, output: torch.Tensor, - ) -> None: - return torch.ops.vllm.kda_attention( - hidden_states, - output, + ) -> torch.Tensor: + num_tokens = hidden_states.size(0) + q = self.q_proj(hidden_states)[0] + k = self.k_proj(hidden_states)[0] + v = self.v_proj(hidden_states)[0] + + beta = self.b_proj(hidden_states)[0].float().sigmoid() + g1 = 
self.f_b_proj(self.f_a_proj(hidden_states)[0])[0] + g1 = fused_kda_gate(g1, self.A_log, self.head_dim, g_bias=self.dt_bias) + beta = beta.unsqueeze(0) + g1 = g1.unsqueeze(0) + + g_proj_states = self.g_b_proj(self.g_a_proj(hidden_states)[0])[0] + g2 = rearrange(g_proj_states, "... (h d) -> ... h d", d=self.head_dim) + + core_attn_out = torch.zeros( + (1, num_tokens, self.local_num_heads, self.head_dim), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + torch.ops.vllm.kda_attention( + q, + k, + v, + g1, + g2, + beta, + core_attn_out, self.prefix, ) + core_attn_out = self.o_norm(core_attn_out, g2) + core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)") + + return self.o_proj(core_attn_out)[0] def _forward( self, - hidden_states: torch.Tensor, - output: torch.Tensor, + q_proj_states: torch.Tensor, + k_proj_states: torch.Tensor, + v_proj_states: torch.Tensor, + g1: torch.Tensor, + g2: torch.Tensor, + beta: torch.Tensor, + core_attn_out: torch.Tensor, ) -> None: forward_context = get_forward_context() attn_metadata: AttentionMetadata = forward_context.attn_metadata if attn_metadata is None: - # V1 profile run - # Mimic the memory allocation in the real run - q = torch.empty_like(hidden_states) - k = torch.empty_like(hidden_states) - v = torch.empty_like(hidden_states) - g = hidden_states.new_empty( - hidden_states.size(0), - self.local_num_heads, - self.head_dim, - dtype=torch.float32, - ) - beta = torch.empty( - hidden_states.size(0), self.local_num_heads, dtype=torch.float32 - ) - core_attn_out = torch.empty_like(hidden_states) + # # V1 profile run return assert isinstance(attn_metadata, dict) @@ -288,10 +325,6 @@ class KimiDeltaAttention(nn.Module, MambaBase): conv_state_k = conv_state_k.transpose(-1, -2) conv_state_v = conv_state_v.transpose(-1, -2) - q_proj_states = self.q_proj(hidden_states)[0] - k_proj_states = self.k_proj(hidden_states)[0] - v_proj_states = self.v_proj(hidden_states)[0] - q_conv_weights = self.q_conv1d.weight.view( self.q_conv1d.weight.size(0), self.q_conv1d.weight.size(2) ) @@ -374,14 +407,6 @@ class KimiDeltaAttention(nn.Module, MambaBase): lambda x: rearrange(x, "n (h d) -> 1 n h d", d=self.head_dim), (q, k, v) ) - beta = self.b_proj(hidden_states)[0].float().sigmoid() - - g = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0] - g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias) - - beta = beta.unsqueeze(0) - g = g.unsqueeze(0) - if attn_metadata.num_prefills > 0: zero_idx = non_spec_state_indices_tensor[~has_initial_state] recurrent_state[zero_idx] = 0 @@ -393,7 +418,7 @@ class KimiDeltaAttention(nn.Module, MambaBase): q=q, k=k, v=v, - g=g, + g=g1, beta=beta, initial_state=initial_state, output_final_state=True, @@ -410,17 +435,12 @@ class KimiDeltaAttention(nn.Module, MambaBase): q=q, k=k, v=v, - g=g, + g=g1, beta=beta, initial_state=recurrent_state, use_qk_l2norm_in_kernel=True, cu_seqlens=non_spec_query_start_loc, ssm_state_indices=non_spec_state_indices_tensor, ) - - g_proj_states = self.g_b_proj(self.g_a_proj(hidden_states)[0])[0] - g = rearrange(g_proj_states, "... (h d) -> ... 
h d", d=self.head_dim) - core_attn_out = self.o_norm(core_attn_out_non_spec, g) - core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)") - - output[:] = self.o_proj(core_attn_out)[0] + assert core_attn_out_non_spec.shape == core_attn_out.shape + core_attn_out[:] = core_attn_out_non_spec From 0384aa7150c4c9778efca041ffd1beb3ad2bd694 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 31 Oct 2025 22:17:21 +0800 Subject: [PATCH 048/976] [CI/Build] Add gpt-oss LoRA test (#27870) Signed-off-by: Jee Jee Li --- .buildkite/test-amd.yaml | 4 +- .buildkite/test-pipeline.yaml | 3 +- tests/lora/conftest.py | 2 +- tests/lora/test_deepseekv2_tp.py | 4 ++ tests/lora/test_gptoss.py | 52 --------------- tests/lora/test_gptoss_tp.py | 106 +++++++++++++++++++++++++++++++ tests/lora/test_qwen3moe_tp.py | 4 ++ 7 files changed, 120 insertions(+), 55 deletions(-) delete mode 100644 tests/lora/test_gptoss.py create mode 100644 tests/lora/test_gptoss_tp.py diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 35bd4c99adb78..c023457fb03e4 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -441,7 +441,7 @@ steps: --ignore=lora/test_llm_with_multi_loras.py \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss.py \ + --ignore=lora/test_gptoss_tp.py \ --ignore=lora/test_qwen3moe_tp.py parallelism: 4 @@ -1217,6 +1217,8 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py + - label: Weight Loading Multiple GPU Test # 33min timeout_in_minutes: 45 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8d4e5ece94d19..3bd5bd87fe6f0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -417,7 +417,7 @@ steps: --ignore=lora/test_llm_with_multi_loras.py \ --ignore=lora/test_olmoe_tp.py \ --ignore=lora/test_deepseekv2_tp.py \ - --ignore=lora/test_gptoss.py \ + --ignore=lora/test_gptoss_tp.py \ --ignore=lora/test_qwen3moe_tp.py parallelism: 4 @@ -1119,6 +1119,7 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 2a688216f25ec..d8ff9339bb49b 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -237,7 +237,7 @@ def deepseekv2_lora_files(): @pytest.fixture(scope="session") def gptoss20b_lora_files(): - return snapshot_download(repo_id="LevinZheng/gpt-oss-20b-lora-adapter") + return snapshot_download(repo_id="jeeejeee/gpt-oss-20b-lora-adapter-text2sql") @pytest.fixture(scope="session") diff --git a/tests/lora/test_deepseekv2_tp.py b/tests/lora/test_deepseekv2_tp.py index 98b7e6333f300..b3496fa88e6bb 100644 --- a/tests/lora/test_deepseekv2_tp.py +++ b/tests/lora/test_deepseekv2_tp.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# NOTE To avoid overloading the CI pipeline, this test script will +# not be triggered on CI and is primarily intended for local testing +# and verification. 
+ import vllm from vllm.lora.request import LoRARequest diff --git a/tests/lora/test_gptoss.py b/tests/lora/test_gptoss.py deleted file mode 100644 index f5c9a5cf20e01..0000000000000 --- a/tests/lora/test_gptoss.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import vllm -from vllm.lora.request import LoRARequest - -MODEL_PATH = "openai/gpt-oss-20b" - -PROMPT_TEMPLATE = "<|begin▁of▁sentence|>You are a helpful assistant.\n\nUser: {context}\n\nAssistant:" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format(context="Who are you?"), - ] - sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, - ) - # Print the outputs. - generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -# FIXME: Load gpt-oss adapter -def test_gptoss20b_lora(gptoss20b_lora_files): - # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, - # Otherwise, the lora-test will fail due to CUDA OOM. - llm = vllm.LLM( - MODEL_PATH, - enable_lora=True, - max_loras=4, - trust_remote_code=True, - ) - - expected_lora_output = [ - "I am an AI language model developed by OpenAI. " - "I am here to help you with any questions or " - "tasks you may have." - ] - - output1 = do_sample(llm, gptoss20b_lora_files, lora_id=1) - print(output1) - for i in range(len(expected_lora_output)): - assert output1[i].startswith(expected_lora_output[i]) diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py new file mode 100644 index 0000000000000..db4b7ca5ef499 --- /dev/null +++ b/tests/lora/test_gptoss_tp.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import vllm +from vllm.lora.request import LoRARequest + +from ..utils import multi_gpu_test + +MODEL_PATH = "openai/gpt-oss-20b" + +PROMPT_TEMPLATE = """<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI. +Knowledge cutoff: 2024-06 +Current date: 2025-10-29 + +Reasoning: medium + +# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. +" +##Instruction: +farm contains tables such as city, farm, farm_competition, competition_record. Table city has columns such as City_ID, Official_Name, Status, Area_km_2, Population, Census_Ranking. City_ID is the primary key. +Table farm has columns such as Farm_ID, Year, Total_Horses, Working_Horses, Total_Cattle, Oxen, Bulls, Cows, Pigs, Sheep_and_Goats. Farm_ID is the primary key. +Table farm_competition has columns such as Competition_ID, Year, Theme, Host_city_ID, Hosts. Competition_ID is the primary key. +Table competition_record has columns such as Competition_ID, Farm_ID, Rank. Competition_ID is the primary key. +The Host_city_ID of farm_competition is the foreign key of City_ID of city. 
+The Farm_ID of competition_record is the foreign key of Farm_ID of farm. +The Competition_ID of competition_record is the foreign key of Competition_ID of farm_competition. + + +###Input: +{context} + +###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501 + +EXPECTED_LORA_OUTPUT = [ + "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;", + "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;", + "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;", + "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;", +] + + +def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: + prompts = [ + PROMPT_TEMPLATE.format( + context="What is the average number of working horses of farms with more than 5000 total number of horses?" # noqa: E501 + ), # noqa: E501 + PROMPT_TEMPLATE.format( + context="Give the average number of working horses on farms with more than 5000 total horses." # noqa: E501 + ), # noqa: E501 + PROMPT_TEMPLATE.format( + context="What are the maximum and minimum number of cows across all farms." + ), + PROMPT_TEMPLATE.format( + context="Return the maximum and minimum number of cows across all farms." + ), + ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, + ) + # Print the outputs. + generated_texts: list[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i]) + + +def test_gpt_oss_lora(gptoss20b_lora_files): + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=8, + compilation_config=vllm.config.CompilationConfig( # Avoid OOM + cudagraph_specialize_lora=False, + ), + ) + + generate_and_test(llm, gptoss20b_lora_files, lora_id=1) + generate_and_test(llm, gptoss20b_lora_files, lora_id=2) + + +@multi_gpu_test(num_gpus=2) +def test_gpt_oss_lora_tp2(gptoss20b_lora_files): + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=2, + max_lora_rank=8, + tensor_parallel_size=2, + compilation_config=vllm.config.CompilationConfig( # Avoid OOM + cudagraph_specialize_lora=False, + ), + ) + + generate_and_test(llm, gptoss20b_lora_files, lora_id=1) + generate_and_test(llm, gptoss20b_lora_files, lora_id=2) diff --git a/tests/lora/test_qwen3moe_tp.py b/tests/lora/test_qwen3moe_tp.py index de2b040907f98..fcac4275cc40e 100644 --- a/tests/lora/test_qwen3moe_tp.py +++ b/tests/lora/test_qwen3moe_tp.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# NOTE To avoid overloading the CI pipeline, this test script will not +# be triggered on CI and is primarily intended for local testing and verification. 
+ import vllm from vllm.lora.request import LoRARequest From 675704ac01e8ab1b986f777f7ccc4ac72159eb7b Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 31 Oct 2025 17:58:42 +0100 Subject: [PATCH 049/976] [Bugfix] Allow 64-bit integer values for LoRA IDs to avoid overflow/truncation (#27876) Signed-off-by: Madeesh Kannan --- vllm/v1/worker/gpu_input_batch.py | 2 +- vllm/v1/worker/tpu_input_batch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bc7578cbd97cd..fe834db115e70 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -204,7 +204,7 @@ class InputBatch: self.num_accepted_tokens_cpu = self.num_accepted_tokens_cpu_tensor.numpy() # lora related - self.request_lora_mapping = np.zeros((self.max_num_reqs,), dtype=np.int32) + self.request_lora_mapping = np.zeros((self.max_num_reqs,), dtype=np.int64) self.lora_id_to_request_ids: dict[int, set[str]] = {} self.lora_id_to_lora_request: dict[int, LoRARequest] = {} diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py index 74e8225b2f4b8..d3fb17054c1a7 100644 --- a/vllm/v1/worker/tpu_input_batch.py +++ b/vllm/v1/worker/tpu_input_batch.py @@ -139,7 +139,7 @@ class InputBatch: self.min_tokens: dict[int, tuple[int, set[int]]] = {} # lora related - self.request_lora_mapping = np.zeros((self.max_num_reqs,), dtype=np.int32) + self.request_lora_mapping = np.zeros((self.max_num_reqs,), dtype=np.int64) self.lora_id_to_request_ids: dict[int, set[str]] = {} self.lora_id_to_lora_request: dict[int, LoRARequest] = {} From 7e06c40e63c12c0ea5fb400fa8f06007e90ff84f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 1 Nov 2025 01:04:51 +0800 Subject: [PATCH 050/976] [Bugfix] Fix broken MRoPE for GLM-4.1V/GLM-4.5V (#27860) Signed-off-by: Isotr0py --- vllm/model_executor/models/glm4_1v.py | 149 +++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 9f1439e21ef79..3e243385fd049 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -26,6 +26,7 @@ # limitations under the License. 
"""Inference-only GLM-4V model compatible with HuggingFace weights.""" +import itertools import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial @@ -36,7 +37,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import BatchFeature +from transformers import BatchFeature, PretrainedConfig from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, @@ -89,6 +90,7 @@ from ..layers.activation import SiluAndMul from .interfaces import ( MultiModalEmbeddings, SupportsLoRA, + SupportsMRoPE, SupportsMultiModal, SupportsPP, ) @@ -1386,7 +1388,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): dummy_inputs=Glm4vDummyInputsBuilder, ) class Glm4vForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP + nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): merge_by_field_config = True @@ -1613,6 +1615,149 @@ class Glm4vForConditionalGeneration( multimodal_embeddings += tuple(video_embeddings) return multimodal_embeddings + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: "PretrainedConfig", + image_grid_thw: list[list[int]] | torch.Tensor | None, + video_grid_thw: list[list[int]] | torch.Tensor | None, + second_per_grid_ts: list[float] | None = None, + context_len: int = 0, + seq_len: int | None = None, + audio_feature_lengths: torch.Tensor | None = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for GLM4V.""" + + image_token_id = hf_config.image_token_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1] + ): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = ( + llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + ) + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + + t_index = ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + 
.flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx + ) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_frame_num, + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + + for t_idx in range(llm_grid_t): + t_index = ( + torch.tensor(t_idx) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(1, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(1, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx + ) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + return llm_positions, mrope_position_delta + def forward( self, input_ids: torch.Tensor, From d6517be3cd06111ada0a603acaeab28dd4580641 Mon Sep 17 00:00:00 2001 From: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Date: Sat, 1 Nov 2025 01:16:00 +0800 Subject: [PATCH 051/976] [Bugfix] Missing NIXL metadata for handshake initialization if instance spans multi-node (#26338) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Guan Luo Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com> Signed-off-by: Guan Luo <41310872+GuanLuo@users.noreply.github.com> Co-authored-by: Nicolò Lucchesi --- docs/features/nixl_connector_usage.md | 2 +- .../kv_connector/unit/test_nixl_connector.py | 106 ++++++++- .../kv_transfer/kv_connector/v1/base.py | 32 +++ .../kv_connector/v1/nixl_connector.py | 224 +++++++++++------- vllm/v1/engine/core.py | 23 +- vllm/v1/executor/abstract.py | 8 + vllm/v1/worker/gpu_worker.py | 21 +- 7 files changed, 321 insertions(+), 95 deletions(-) diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 605398652ee0b..1ce038f4d6525 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -81,7 +81,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ - Default: 5600 - **Required for both prefiller and decoder instances** - Each vLLM worker needs a unique port on its host; using the same port number across different hosts is fine - - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank * tp_size + tp_rank (e.g., with `--tensor-parallel-size=4` and base_port=5600, tp_rank 0..3 use ports 5600, 5601, 5602, 5603 on that node). + - For TP/DP deployments, each worker's port on a node is computed as: base_port + dp_rank (e.g., with `--data-parallel-size=2` and base_port=5600, dp_rank 0..1 use port 5600, 5601 on that node). 
- Used for the initial NIXL handshake between the prefiller and the decoder - `VLLM_NIXL_SIDE_CHANNEL_HOST`: Host for side channel communication diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 445d115010cdf..44d8b3e331fdb 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -27,6 +27,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, + NixlConnectorScheduler, NixlConnectorWorker, NixlKVConnectorStats, ) @@ -283,6 +284,92 @@ def test_prompt_less_than_block_size(): assert len(scheduler_output.scheduled_new_reqs) == 0 +@patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper, +) +def test_kv_transfer_handshake(dist_init): + """Unit test for basic NixlConnector interface functionality.""" + + # Test setup, we creates a scheduler that contains a NixlConnector + # of role SCHEDULER, and expect it to be serving NixlAgentMetadata from + # all workers of the instance. + vllm_config = create_vllm_config() + # in case the test runs on non-GPU machine + vllm_config.kv_transfer_config.kv_buffer_device = "cpu" + scheduler = create_scheduler(vllm_config) + + # Create two NixlConnector of role WORKER, one is the worker of + # the scheduler (prefill), the other is a worker of decode instance. + + # Prefill connector will register KV cache to populate proper handshake + # metadata. + prefill_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( + num_blocks=2, block_size=16, num_kv_heads=4, head_size=64 + ) + shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16) + unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16) + kv_caches = { + "layer0": shared_tensor, + "layer1": unique_tensor, + "layer2": shared_tensor, + } + prefill_connector.register_kv_caches(kv_caches) + + # Simulate EngineCore initialization that would + # gather connector metadata from all workers, the scheduler connector + # expects metadata to be in dict[int, KVConnectorHandshakeMetadata], + # where the first key is the dp_rank, the second key is the tp_rank. + metadata = {0: prefill_connector.get_handshake_metadata()} + scheduler_connector = scheduler.get_kv_connector() + scheduler_connector.set_xfer_handshake_metadata(metadata) + + # Simulate a request that finishes prefill, which returns + # corresponding NixlConnectorMetadata for decode instance. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request = create_request( + request_id=1, + block_size=BLOCK_SIZE, + num_tokens=NUM_TOKENS, + do_remote_decode=True, + ) + request.status = RequestStatus.FINISHED_LENGTH_CAPPED + delay, kv_connector_metadata = scheduler.get_kv_connector().request_finished( + request, [0, 1, 2] + ) + assert delay + + # Decode connector will be able to create handshake with the prefill connector. + decode_connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + + # Here we are testing the retrieval of NIXLAgentMetadata. + # Knowing the implementation detail, we override the add_remote_agent + # to validate the metadata received is the same as the one in prefill_connector. 
+ with patch.object( + decode_connector.connector_worker, "add_remote_agent" + ) as mock_add_remote_agent: + mock_add_remote_agent.return_type = "remote_agent" + + decode_connector.connector_worker._nixl_handshake( + kv_connector_metadata["remote_host"], + kv_connector_metadata["remote_port"], + kv_connector_metadata["tp_size"], + kv_connector_metadata["remote_engine_id"], + ) + + received_metadata = mock_add_remote_agent.call_args.args + assert received_metadata[1] == 0 # remote_tp_rank + assert received_metadata[2] == 1 # remote_tp_size + assert metadata[0] == received_metadata[0] + + # Need to shutdown the background thread to release NIXL side channel port + scheduler_connector.shutdown() + + class FakeNixlConnectorWorker(NixlConnectorWorker): REMOTE_ENGINE_ID = "remote_engine" @@ -313,6 +400,7 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): engine_id=self.REMOTE_ENGINE_ID, agent_metadata=FakeNixlWrapper.AGENT_METADATA, kv_caches_base_addr=[0], + device_id=0, num_blocks=1, block_lens=self.block_len_per_layer, attn_backend_name=self.backend_name, @@ -559,6 +647,7 @@ class TestNixlHandshake: engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, agent_metadata=FakeNixlWrapper.AGENT_METADATA, kv_caches_base_addr=[0], + device_id=0, num_blocks=1, block_lens=worker.block_len_per_layer, attn_backend_name=worker.backend_name, @@ -611,6 +700,7 @@ class TestNixlHandshake: engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, agent_metadata=FakeNixlWrapper.AGENT_METADATA, kv_caches_base_addr=[0], + device_id=0, num_blocks=1, # prefill TP=1, decode TP=2, remote block_lens is double to local block_lens=[i * 2 for i in worker.block_len_per_layer], @@ -1005,6 +1095,8 @@ def _run_abort_timeout_test(llm: LLM, timeout: int): _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params) # Request-0 times out and is cleared! 
assert "0" not in req_to_blocks + # Need to shutdown the background thread to release NIXL side channel port + llm.llm_engine.engine_core.shutdown() def test_register_kv_caches(dist_init): @@ -1177,13 +1269,15 @@ def test_shutdown_cleans_up_resources(dist_init): """Test that shutdown() properly cleans up all resources.""" vllm_config = create_vllm_config() + scheduler = NixlConnectorScheduler( + vllm_config, vllm_config.kv_transfer_config.engine_id + ) worker = NixlConnectorWorker(vllm_config, vllm_config.kv_transfer_config.engine_id) nixl_wrapper = worker.nixl_wrapper with ( patch.object(worker, "_handshake_initiation_executor") as mock_exec, - patch.object(worker, "_nixl_handshake_listener_t") as mock_listener, - patch.object(worker, "_nixl_handshake_listener_stop_event") as mock_event, + patch.object(scheduler, "_nixl_handshake_listener_t") as mock_listener, patch.object(nixl_wrapper, "release_xfer_handle") as mock_rel_xfer, patch.object(nixl_wrapper, "release_dlist_handle") as mock_rel_dlist, patch.object(nixl_wrapper, "remove_remote_agent") as mock_rem_agent, @@ -1204,8 +1298,12 @@ def test_shutdown_cleans_up_resources(dist_init): worker.shutdown() mock_exec.shutdown.assert_called_with(wait=False) - mock_event.set.assert_called_once() - mock_listener.join.assert_called_once_with(timeout=1.0) + + # Same sequence on scheduler.shutdown() + scheduler.shutdown() + scheduler.shutdown() + scheduler.shutdown() + mock_listener.join.assert_called_once() mock_rel_xfer.assert_called_once_with(123) assert mock_rel_dlist.call_count == 2 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 2ed0fe592e373..cb9f208a839f2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -122,6 +122,15 @@ class KVConnectorRole(enum.Enum): WORKER = 1 +class KVConnectorHandshakeMetadata(ABC): # noqa: B024 + """ + Metadata used for out of band connector handshake between + P/D workers. This needs to serializeable. + """ + + pass + + class KVConnectorMetadata(ABC): # noqa: B024 """ Abstract Metadata used to communicate between the @@ -320,6 +329,18 @@ class KVConnectorBase_V1(ABC): """ return None + def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: + """ + Get the KVConnector handshake metadata for this connector. + This metadata is used for out-of-band connector handshake + between P/D workers. + + Returns: + KVConnectorHandshakeMetadata: the handshake metadata. + None if no handshake metadata is available. + """ + return None + # ============================== # Scheduler-side methods # ============================== @@ -477,6 +498,17 @@ class KVConnectorBase_V1(ABC): """ return None + def set_xfer_handshake_metadata( + self, metadata: dict[int, KVConnectorHandshakeMetadata] + ) -> None: + """ + Set the KV connector handshake metadata for this connector. + + Args: + metadata (KVConnectorHandshakeMetadata): the handshake metadata to set. 
+ """ + return None + @classmethod def build_prom_metrics( cls, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index d5712bdd9feb4..4651cedbc7dfa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -27,6 +27,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( CopyBlocksOp, KVConnectorBase_V1, + KVConnectorHandshakeMetadata, KVConnectorMetadata, KVConnectorRole, ) @@ -93,15 +94,12 @@ _NIXL_SUPPORTED_DEVICE = { _NIXL_SUPPORTED_DEVICE.update(current_platform.get_nixl_supported_devices()) -class NixlAgentMetadata( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - # required for @cached_property. - dict=True, -): +@dataclass +class NixlAgentMetadata(KVConnectorHandshakeMetadata): engine_id: str agent_metadata: bytes kv_caches_base_addr: list[int] + device_id: int num_blocks: int block_lens: list[int] attn_backend_name: str @@ -223,6 +221,18 @@ class NixlConnector(KVConnectorBase_V1): assert self.connector_scheduler is not None return self.connector_scheduler.request_finished(request, block_ids) + def set_xfer_handshake_metadata( + self, metadata: dict[int, KVConnectorHandshakeMetadata] + ) -> None: + """ + Set the KV connector handshake metadata for this connector. + + Args: + metadata (dict): the handshake metadata to set. + """ + assert self.connector_scheduler is not None + self.connector_scheduler.set_xfer_handshake_metadata(metadata) + ############################################################ # Worker Side Methods ############################################################ @@ -299,6 +309,21 @@ class NixlConnector(KVConnectorBase_V1): def shutdown(self): if self.connector_worker is not None: self.connector_worker.shutdown() + if self.connector_scheduler is not None: + self.connector_scheduler.shutdown() + + def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: + """ + Get the KVConnector handshake metadata for this connector. + This metadata is used for out-of-band connector handshake + between P/D workers. + + Returns: + KVConnectorHandshakeMetadata: the handshake metadata. + None if no handshake metadata is available. + """ + assert self.connector_worker is not None + return self.connector_worker.xfer_handshake_metadata class NixlConnectorScheduler: @@ -312,12 +337,16 @@ class NixlConnectorScheduler: self.side_channel_port = ( envs.VLLM_NIXL_SIDE_CHANNEL_PORT + vllm_config.parallel_config.data_parallel_rank - * vllm_config.parallel_config.tensor_parallel_size ) assert vllm_config.kv_transfer_config is not None self.use_host_buffer = vllm_config.kv_transfer_config.kv_buffer_device == "cpu" logger.info("Initializing NIXL Scheduler %s", engine_id) + # Background thread for handling new handshake requests. + self._nixl_handshake_listener_t: threading.Thread | None = None + self._encoded_xfer_handshake_metadata: dict[int, Any] = {} + self._stop_event = threading.Event() + # Requests that need to start recv/send. # New requests are added by update_state_after_alloc in # the scheduler. Used to make metadata passed to Worker. @@ -330,6 +359,89 @@ class NixlConnectorScheduler: # remote prefill or aborted. 
self._reqs_not_processed: set[ReqId] = set() + def shutdown(self): + self._stop_event.set() + if self._nixl_handshake_listener_t is not None: + self._nixl_handshake_listener_t.join() + self._nixl_handshake_listener_t = None + + def set_xfer_handshake_metadata( + self, metadata: dict[int, KVConnectorHandshakeMetadata] + ) -> None: + """ + Set the KV connector handshake metadata for this connector. + + Args: + metadata (dict): the handshake metadata to set. + """ + encoded_data: dict[int, bytes] = {} + encoder = msgspec.msgpack.Encoder() + for tp_rank, rank_metadata in metadata.items(): + if not isinstance(rank_metadata, NixlAgentMetadata): + raise ValueError( + "NixlConnectorScheduler expects NixlAgentMetadata for " + "handshake metadata." + ) + encoded_data[tp_rank] = encoder.encode(rank_metadata) + logger.debug( + "Tp rank %d: encoded NixlAgentMetadata size: %s bytes", + tp_rank, + str(len(encoded_data[tp_rank])), + ) + self._encoded_xfer_handshake_metadata = encoded_data + + # Only start the listener when we have metadata to serve. + if self._nixl_handshake_listener_t is None: + ready_event = threading.Event() + self._nixl_handshake_listener_t = threading.Thread( + target=self._nixl_handshake_listener, + args=( + encoded_data, + ready_event, + self._stop_event, + self.side_channel_port, + ), + daemon=True, + name="nixl_handshake_listener", + ) + self._nixl_handshake_listener_t.start() + ready_event.wait() # Wait for listener ZMQ socket to be ready. + + @staticmethod + def _nixl_handshake_listener( + encoded_data: dict[int, Any], + ready_event: threading.Event, + stop_event: threading.Event, + port: int, + ): + """Background thread for getting new NIXL handshakes.""" + # NOTE(rob): this is a simple implementation. We will move + # to a better approach via HTTP endpoint soon. + + # Listen for new requests for metadata. + host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST + path = make_zmq_path("tcp", host, port) + logger.debug("Starting listening on path: %s", path) + with zmq_ctx(zmq.ROUTER, path) as sock: + sock.setsockopt(zmq.RCVTIMEO, 1000) + ready_event.set() + while True: + try: + identity, _, msg = sock.recv_multipart() + except zmq.Again: + if stop_event.is_set(): + break + continue + # Decode the message which contains (GET_META_MSG, rank) + msg, target_tp_rank = msgspec.msgpack.decode(msg) + logger.debug( + "Received message for tp rank %s", + target_tp_rank, + ) + if msg != GET_META_MSG: + logger.warning("Connection listener got unexpected message %s", msg) + sock.send_multipart((identity, b"", encoded_data[target_tp_rank])) + def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int ) -> tuple[int, bool]: @@ -537,8 +649,6 @@ class NixlConnectorScheduler: class NixlConnectorWorker: """Implementation of Worker side methods""" - _POLL_TIMEOUT = 0.1 # Handshake thread polls for stop event every 100ms - @dataclass class TpKVTopology: """ @@ -651,16 +761,6 @@ class NixlConnectorWorker: # Map of engine_id -> {rank0: agent_name0, rank1: agent_name1..}. self._remote_agents: dict[EngineId, dict[int, str]] = defaultdict(dict) - # NIXL handshake port. - # NOTE(rob): Within a DP group, each DP rank gets its own - # base port (which is sent in the KVTransferParams). - # Each TP rank listens/queries on the base_port + tp_rank. - self.side_channel_port: int = ( - envs.VLLM_NIXL_SIDE_CHANNEL_PORT - + vllm_config.parallel_config.data_parallel_rank - * vllm_config.parallel_config.tensor_parallel_size - ) - # Metadata. 
self.engine_id: EngineId = engine_id self.tp_rank = get_tensor_model_parallel_rank() @@ -706,6 +806,7 @@ class NixlConnectorWorker: # Map of engine_id -> kv_caches_base_addr. For TP case, each local # rank will still only pull from a single remote TP worker. self.kv_caches_base_addr: dict[EngineId, list[int]] = {} + self.device_id: int = 0 # Number of NIXL regions. Currently one region per cache # (so 1 per layer for MLA, otherwise 2 per layer) @@ -736,9 +837,8 @@ class NixlConnectorWorker: # requests that skipped transfer (handshake or transfer failures) self._failed_recv_reqs: set[ReqId] = set() - # Background thread for handling new handshake requests. - self._nixl_handshake_listener_t: threading.Thread | None = None - self._nixl_handshake_listener_stop_event: threading.Event | None = None + # Handshake metadata of this worker for NIXL transfers. + self.xfer_handshake_metadata: NixlAgentMetadata | None = None # Background thread for initializing new NIXL handshakes. self._handshake_initiation_executor = ThreadPoolExecutor( # NIXL is not guaranteed to be thread-safe, limit 1 worker. @@ -790,42 +890,6 @@ class NixlConnectorWorker: total_num_kv_heads=self.model_config.get_total_num_kv_heads(), ) - @staticmethod - def _nixl_handshake_listener( - metadata: NixlAgentMetadata, - ready_event: threading.Event, - stop_event: threading.Event, - base_port: int, - tp_rank: int, - ): - """Background thread for getting new NIXL handshakes.""" - # NOTE(rob): this is a simple implementation. We will move - # to a better approach via HTTP endpoint soon. - - encoder = msgspec.msgpack.Encoder() - encoded_data = encoder.encode(metadata) - size_in_bytes = len(encoded_data) - logger.debug("Size of encoded NixlAgentMetadata: %s bytes", str(size_in_bytes)) - - # Listen for new requests for metadata. - host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST - path = make_zmq_path("tcp", host, base_port + tp_rank) - logger.debug("Starting listening on path: %s", path) - with zmq_ctx(zmq.ROUTER, path) as sock: - ready_event.set() - poller = zmq.Poller() - poller.register(sock, zmq.POLLIN) - while not stop_event.is_set(): - events = dict( - poller.poll(timeout=NixlConnectorWorker._POLL_TIMEOUT * 1000) - ) - if sock not in events: - continue - identity, _, msg = sock.recv_multipart() - if msg != GET_META_MSG: - logger.warning("Connection listener got unexpected message %s", msg) - sock.send_multipart((identity, b"", encoded_data)) - def _nixl_handshake( self, host: str, @@ -844,16 +908,17 @@ class NixlConnectorWorker: # Handshake only with the remote TP rank that current local rank will # pull from. With homogeneous TP it happens to be the same rank_i. p_remote_rank = self.kv_topo.get_target_remote_rank(remote_tp_size) - path = make_zmq_path("tcp", host, port + p_remote_rank) + path = make_zmq_path("tcp", host, port) logger.debug( - "Querying metadata on path: %s at remote rank %s", path, p_remote_rank + "Querying metadata on path: %s at remote tp rank %s", path, p_remote_rank ) # Send query for the request. 
with zmq_ctx(zmq.REQ, path) as sock: + msg = msgspec.msgpack.encode((GET_META_MSG, p_remote_rank)) # Set receive timeout to 5 seconds to avoid hanging on dead server sock.setsockopt(zmq.RCVTIMEO, 5000) # milliseconds - sock.send(GET_META_MSG) + sock.send(msg) metadata_bytes = sock.recv() decoder = msgspec.msgpack.Decoder(NixlAgentMetadata) metadata = decoder.decode(metadata_bytes) @@ -1042,6 +1107,10 @@ class NixlConnectorWorker: assert tensor_size_bytes == curr_tensor_size_bytes, ( "All kv cache tensors must have the same size" ) + # Need to make sure the device ID is non-negative for NIXL, + # Torch uses -1 to indicate CPU tensors while NIXL uses explicit + # memory type. + self.device_id = max(cache.get_device(), 0) caches_data.append( (base_addr, curr_tensor_size_bytes, self.device_id, "") ) @@ -1139,10 +1208,11 @@ class NixlConnectorWorker: assert len(self.block_window_per_layer) == self.num_layers # After KV Caches registered, listen for new connections. - metadata = NixlAgentMetadata( + self.xfer_handshake_metadata = NixlAgentMetadata( engine_id=self.engine_id, agent_metadata=self.nixl_wrapper.get_agent_metadata(), kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], + device_id=self.device_id, num_blocks=self.num_blocks, block_lens=self.block_len_per_layer, attn_backend_name=self.backend_name, @@ -1150,22 +1220,6 @@ class NixlConnectorWorker: if not self.use_host_buffer else self.host_buffer_kv_cache_layout, ) - ready_event, stop_event = threading.Event(), threading.Event() - self._nixl_handshake_listener_t = threading.Thread( - target=self._nixl_handshake_listener, - args=( - metadata, - ready_event, - stop_event, - self.side_channel_port, - self.tp_rank, - ), - daemon=True, - name="nixl_handshake_listener", - ) - self._nixl_handshake_listener_t.start() - self._nixl_handshake_listener_stop_event = stop_event - ready_event.wait() # Wait for listener ZMQ socket to be ready. def add_remote_agent( self, @@ -1267,7 +1321,7 @@ class NixlConnectorWorker: # self.block_len == remote_block_len//tp_ratio bytes. addr = base_addr + block_offset + rank_offset # (addr, len, device id) - blocks_data.append((addr, kv_block_len, remote_tp_rank)) + blocks_data.append((addr, kv_block_len, nixl_agent_meta.device_id)) if self._use_flashinfer: # With FlashInfer index V separately to allow head splitting. 
@@ -1275,7 +1329,9 @@ class NixlConnectorWorker: block_offset = block_id * nixl_agent_meta.block_lens[i] addr = base_addr + block_offset + rank_offset v_addr = addr + nixl_agent_meta.block_lens[i] // 2 - blocks_data.append((v_addr, kv_block_len, remote_tp_rank)) + blocks_data.append( + (v_addr, kv_block_len, nixl_agent_meta.device_id) + ) logger.debug( "Created %s blocks for dst engine %s with remote rank %s and local rank %s", @@ -1843,14 +1899,6 @@ class NixlConnectorWorker: def shutdown(self): """Shutdown the connector worker.""" self._handshake_initiation_executor.shutdown(wait=False) - if self._nixl_handshake_listener_stop_event is not None: - self._nixl_handshake_listener_stop_event.set() - self._nixl_handshake_listener_stop_event = None - if self._nixl_handshake_listener_t is not None: - # Generous timeout to allow the thread to exit - self._nixl_handshake_listener_t.join(timeout=self._POLL_TIMEOUT * 10) - assert not self._nixl_handshake_listener_t.is_alive() - self._nixl_handshake_listener_t = None for handles in self._recving_transfers.values(): for handle, _ in handles: self.nixl_wrapper.release_xfer_handle(handle) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6cbd986b3cd32..bfe87b718282c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -163,6 +163,27 @@ class EngineCore: vllm_config, mm_registry ) + # If a KV connector is initialized for scheduler, we want to collect + # handshake metadata from all workers so the connector in the scheduler + # will have the full context + kv_connector = self.scheduler.get_kv_connector() + if kv_connector is not None: + # Collect and store KV connector xfer metadata from workers + # (after KV cache registration) + xfer_handshake_metadata = ( + self.model_executor.get_kv_connector_handshake_metadata() + ) + + if xfer_handshake_metadata: + # xfer_handshake_metadata is list of dicts from workers + # Each dict already has structure {tp_rank: metadata} + # Merge all worker dicts into a single dict + content: dict[int, Any] = {} + for worker_dict in xfer_handshake_metadata: + if worker_dict is not None: + content.update(worker_dict) + kv_connector.set_xfer_handshake_metadata(content) + # Setup batch queue for pipeline parallelism. # Batch queue for scheduled batches. 
This enables us to asynchronously # schedule and execute batches, and is required by pipeline parallelism @@ -178,7 +199,7 @@ class EngineCore: self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None if ( self.vllm_config.cache_config.enable_prefix_caching - or self.scheduler.get_kv_connector() is not None + or kv_connector is not None ): caching_hash_fn = get_hash_fn_by_name( vllm_config.cache_config.prefix_caching_hash_algo diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 9fe1912c73e39..ef7840e1796f7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -9,6 +9,9 @@ from typing import TYPE_CHECKING, Literal, TypeVar, overload from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorHandshakeMetadata, +) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask @@ -177,6 +180,11 @@ class Executor(ABC): ): raise NotImplementedError + def get_kv_connector_handshake_metadata( + self, + ) -> list[dict[int, KVConnectorHandshakeMetadata]]: + return self.collective_rpc("get_kv_connector_handshake_metadata") + @overload def execute_model( self, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 54c5f81fc7e8e..5b11bdf5282fa 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -19,7 +19,11 @@ from vllm.distributed import ( init_distributed_environment, set_custom_all_reduce, ) -from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized +from vllm.distributed.kv_transfer import ( + ensure_kv_transfer_initialized, + get_kv_transfer_group, + has_kv_transfer_group, +) from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, @@ -348,6 +352,21 @@ class Worker(WorkerBase): return int(self.available_kv_cache_memory_bytes) + def get_kv_connector_handshake_metadata(self) -> dict | None: + """Get KV connector metadata from this worker if available.""" + + if not has_kv_transfer_group(): + return None + + connector = get_kv_transfer_group() + # Return None for connectors that don't need to exchange handshake + # metadata across workers. 
+ if (metadata := connector.get_handshake_metadata()) is None: + return None + + tp_rank = get_tp_group().rank_in_group + return {tp_rank: metadata} + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return self.model_runner.get_kv_cache_spec() From 70bfbd7b168a216c6b5cb4db678a48b1e3f9c720 Mon Sep 17 00:00:00 2001 From: Rob Mulla Date: Fri, 31 Oct 2025 13:29:55 -0400 Subject: [PATCH 052/976] Docs update tpu install instructions (#27824) Signed-off-by: Rob Mulla Signed-off-by: Rob Mulla Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/configuration/tpu.md | 2 +- docs/getting_started/installation/.nav.yml | 2 +- docs/getting_started/installation/README.md | 2 +- .../installation/google_tpu.md | 193 ------------------ docs/getting_started/quickstart.md | 11 + 5 files changed, 14 insertions(+), 196 deletions(-) delete mode 100644 docs/getting_started/installation/google_tpu.md diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index 25d371e627b75..2d24c9c6e2e95 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -4,7 +4,7 @@ This doc serves as a collection of handy tips for optimizing your vLLM on TPU wo ## Get started -Looking for setup and installation instructions? Find them [here](../getting_started/installation/google_tpu.md). +Looking for setup and installation instructions? Find them [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/). ### TPU workload sizing diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml index ba1f8099a6456..683322cf3b7b0 100644 --- a/docs/getting_started/installation/.nav.yml +++ b/docs/getting_started/installation/.nav.yml @@ -2,4 +2,4 @@ nav: - README.md - gpu.md - cpu.md - - google_tpu.md + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/ diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index a4e63e426b9ba..d5082bc7dd3a9 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -11,7 +11,6 @@ vLLM supports the following hardware platforms: - [ARM AArch64](cpu.md#arm-aarch64) - [Apple silicon](cpu.md#apple-silicon) - [IBM Z (S390X)](cpu.md#ibm-z-s390x) -- [Google TPU](google_tpu.md) ## Hardware Plugins @@ -20,6 +19,7 @@ The backends below live **outside** the main `vllm` repository and follow the | Accelerator | PyPI / package | Repository | |-------------|----------------|------------| +| Google TPU | `tpu-inference` | | | Ascend NPU | `vllm-ascend` | | | Intel Gaudi (HPU) | N/A, install from source | | | MetaX MACA GPU | N/A, install from source | | diff --git a/docs/getting_started/installation/google_tpu.md b/docs/getting_started/installation/google_tpu.md deleted file mode 100644 index 0f8c5bccd4b95..0000000000000 --- a/docs/getting_started/installation/google_tpu.md +++ /dev/null @@ -1,193 +0,0 @@ -# Google TPU - -Tensor Processing Units (TPUs) are Google's custom-developed application-specific -integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs -are available in different versions each with different hardware specifications. -For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm). 
-For more information on the TPU versions supported with vLLM, see: - -- [TPU v6e](https://cloud.google.com/tpu/docs/v6e) -- [TPU v5e](https://cloud.google.com/tpu/docs/v5e) -- [TPU v5p](https://cloud.google.com/tpu/docs/v5p) -- [TPU v4](https://cloud.google.com/tpu/docs/v4) - -These TPU versions allow you to configure the physical arrangements of the TPU -chips. This can improve throughput and networking performance. For more -information see: - -- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations) -- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config) -- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config) -- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config) - -In order for you to use Cloud TPUs you need to have TPU quota granted to your -Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a -GPC project and are specified in terms of TPU version, the number of TPU you -want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota). - -For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing). - -You may need additional persistent storage for your TPU VMs. For more -information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options). - -!!! warning - There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. - -## Requirements - -- Google Cloud TPU VM -- TPU versions: v6e, v5e, v5p, v4 -- Python: 3.11 or newer - -### Provision Cloud TPUs - -You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest) -or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources) -API (preferred). This section shows how to create TPUs using the queued resource API. For -more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api). -Queued resources enable you to request Cloud TPU resources in a queued manner. -When you request queued resources, the request is added to a queue maintained by -the Cloud TPU service. When the requested resource becomes available, it's -assigned to your Google Cloud project for your immediate exclusive use. - -!!! note - In all of the following commands, replace the ALL CAPS parameter names with - appropriate values. See the parameter descriptions table for more information. 
- -### Provision Cloud TPUs with GKE - -For more information about using TPUs with GKE, see: - -- [About TPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/tpus) -- [Deploy TPU workloads in GKE Standard](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus) -- [Plan for TPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus) - -## Configure a new environment - -### Provision a Cloud TPU with the queued resource API - -Create a TPU v5e with 4 TPU chips: - -```bash -gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ - --node-id TPU_NAME \ - --project PROJECT_ID \ - --zone ZONE \ - --accelerator-type ACCELERATOR_TYPE \ - --runtime-version RUNTIME_VERSION \ - --service-account SERVICE_ACCOUNT -``` - -| Parameter name | Description | -|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request. | -| TPU_NAME | The user-assigned name of the TPU which is created when the queued resource request is allocated. | -| PROJECT_ID | Your Google Cloud project | -| ZONE | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones] | -| ACCELERATOR_TYPE | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions]. | -| RUNTIME_VERSION | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). | -| SERVICE_ACCOUNT | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` | - -Connect to your TPU VM using SSH: - -```bash -gcloud compute tpus tpu-vm ssh TPU_NAME --project PROJECT_ID --zone ZONE -``` - -!!! note - When configuring `RUNTIME_VERSION` ("TPU software version") on GCP, ensure it matches the TPU generation you've selected by referencing the [TPU VM images] compatibility matrix. Using an incompatible version may prevent vLLM from running correctly. - -[TPU versions]: https://cloud.google.com/tpu/docs/runtimes -[TPU VM images]: https://cloud.google.com/tpu/docs/runtimes -[TPU regions and zones]: https://cloud.google.com/tpu/docs/regions-zones - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built TPU wheels. - -### Build wheel from source - -Install Miniconda: - -```bash -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -bash Miniconda3-latest-Linux-x86_64.sh -source ~/.bashrc -``` - -Create and activate a Conda environment for vLLM: - -```bash -conda create -n vllm python=3.12 -y -conda activate vllm -``` - -Clone the vLLM repository and go to the vLLM directory: - -```bash -git clone https://github.com/vllm-project/vllm.git && cd vllm -``` - -Uninstall the existing `torch` and `torch_xla` packages: - -```bash -pip uninstall torch torch-xla -y -``` - -Install build dependencies: - -```bash -pip install -r requirements/tpu.txt -sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev -``` - -Run the setup script: - -```bash -VLLM_TARGET_DEVICE="tpu" python -m pip install -e . 
-``` - -## Set up using Docker - -### Pre-built images - -See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`. - -### Build image from source - -You can use [docker/Dockerfile.tpu](../../../docker/Dockerfile.tpu) to build a Docker image with TPU support. - -```bash -docker build -f docker/Dockerfile.tpu -t vllm-tpu . -``` - -Run the Docker image with the following command: - -```bash -# Make sure to add `--privileged --net host --shm-size=16G`. -docker run --privileged --net host --shm-size=16G -it vllm-tpu -``` - -!!! note - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the - possible input shapes and compiles an XLA graph for each shape. The - compilation time may take 20~30 minutes in the first run. However, the - compilation time reduces to ~5 minutes afterwards because the XLA graphs are - cached in the disk (in `VLLM_XLA_CACHE_PATH` or `~/.cache/vllm/xla_cache` by default). - -!!! tip - If you encounter the following error: - - ```console - from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such - file or directory - ``` - - Install OpenBLAS with the following command: - - ```bash - sudo apt-get install --no-install-recommends --yes libopenblas-base libopenmpi-dev libomp-dev - ``` diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 70a91b7454ceb..cfc8b4d9838a7 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -63,6 +63,17 @@ This guide will help you quickly get started with vLLM to perform: rocm/vllm-dev:nightly ``` +=== "Google TPU" + + To run vLLM on Google TPUs, you need to install the `vllm-tpu` package. + + ```bash + uv pip install vllm-tpu + ``` + + !!! note + For more detailed instructions, including Docker, installing from source, and troubleshooting, please refer to the [vLLM on TPU documentation](https://docs.vllm.ai/projects/tpu/en/latest/). + !!! note For more detail and non-CUDA platforms, please refer [here](installation/README.md) for specific instructions on how to install vLLM. From 103a468bbfd3d848cdfa5845909500d58a43119a Mon Sep 17 00:00:00 2001 From: Chenguang Zheng <645327136@qq.com> Date: Sat, 1 Nov 2025 01:34:27 +0800 Subject: [PATCH 053/976] [bugfix] Missing cached item in beam search (#27874) Signed-off-by: fake0fan <645327136@qq.com> Co-authored-by: Cyrus Leung --- vllm/entrypoints/openai/serving_engine.py | 28 ++++++++--------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c0750cd641667..46e79edbde611 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -345,22 +345,7 @@ class OpenAIServing: if is_explicit_encoder_decoder_prompt(prompt): raise NotImplementedError - else: - processed_inputs = processor.input_preprocessor._prompt_to_llm_inputs( - prompt - ) - if processed_inputs["type"] == "embeds": - raise NotImplementedError - - # This is a workaround to fix multimodal beam search; this is a - # bandaid fix for 2 small problems: - # 1. Multi_modal_data on the processed_inputs currently resolves to - # `None`. - # 2. preprocessing above expands the multimodal placeholders. However, - # this happens again in generation, so the double expansion causes - # a mismatch. 
- # TODO - would be ideal to handle this more gracefully. prompt_text: str | None prompt_token_ids: list[int] multi_modal_data: MultiModalDataDict | None @@ -373,9 +358,16 @@ class OpenAIServing: prompt_token_ids = prompt.get("prompt_token_ids", []) # type: ignore multi_modal_data = prompt.get("multi_modal_data") # type: ignore - mm_processor_kwargs: dict[str, Any] | None = processed_inputs.get( - "mm_processor_kwargs" - ) # type: ignore + mm_processor_kwargs: dict[str, Any] | None = None + + # This is a workaround to fix multimodal beam search; this is a + # bandaid fix for 2 small problems: + # 1. Multi_modal_data on the processed_inputs currently resolves to + # `None`. + # 2. preprocessing above expands the multimodal placeholders. However, + # this happens again in generation, so the double expansion causes + # a mismatch. + # TODO - would be ideal to handle this more gracefully. tokenized_length = len(prompt_token_ids) From bc306fe5e97823e2a2e989725bd5e39a897a43a6 Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Sat, 1 Nov 2025 01:38:02 +0800 Subject: [PATCH 054/976] fix incorrect type annotation in KimiMLP (#27885) Signed-off-by: zitian.zhao --- vllm/model_executor/models/kimi_linear.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index f8df72b067dd0..cce22842d3330 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -22,7 +22,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, MergedColumnParallelLinear, - QKVParallelLinear, ReplicatedLinear, RowParallelLinear, ) @@ -61,7 +60,7 @@ class KimiMLP(nn.Module): hidden_size: int, intermediate_size: int, hidden_act: str, - quant_config: QKVParallelLinear | None = None, + quant_config: QuantizationConfig | None = None, reduce_results: bool = True, prefix: str = "", ) -> None: From fc16f1c4779023e2e9ca3efe7a8b78a27cd318ac Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Fri, 31 Oct 2025 10:54:29 -0700 Subject: [PATCH 055/976] Flashinfer_CUTLASS_MOE fuses quantization for TP (#27223) Signed-off-by: Shu Wang. --- .../fused_moe/flashinfer_cutlass_moe.py | 6 ++++- .../flashinfer_cutlass_prepare_finalize.py | 17 +++++++------- .../layers/quantization/modelopt.py | 23 ------------------- .../quantization/utils/flashinfer_fp4_moe.py | 1 + 4 files changed, 15 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index b7820319682be..85ce77fb1f7f7 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -56,6 +56,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): ep_size: int = 1, tp_rank: int = 0, tp_size: int = 1, + use_dp: bool = False, ): super().__init__(quant_config) assert quant_config.quant_dtype in ("nvfp4", torch.float8_e4m3fn, None), ( @@ -67,6 +68,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): self.tp_rank = tp_rank self.tp_size = tp_size self.out_dtype = out_dtype + self.use_dp = use_dp @property def activation_formats( @@ -117,7 +119,8 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): """ workspace1 = (M, K) workspace2 = (0,) - output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" else K) + # For TP, the quantization is fused with fused_moe call. 
+ output_shape = (M, K * 2 if self.quant_dtype == "nvfp4" and self.use_dp else K) # The workspace is determined by `aq`, since it comes after any # potential communication op and is involved in the expert computation. return (workspace1, workspace2, output_shape) @@ -214,6 +217,7 @@ def flashinfer_cutlass_moe_fp4( FlashInferExperts( out_dtype=hidden_states.dtype, quant_config=quant_config, + use_dp=False, ), ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 20e2f6c851861..051abbcb7949d 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -170,6 +170,8 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin self._apply_router_weight_on_input( a1, topk_weights, topk_ids, apply_router_weight_on_input ) + if not self.use_dp: + return a1, None, None, topk_ids, topk_weights a1q, a1q_scale = moe_kernel_quantize_input( a1, @@ -179,14 +181,13 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin quant_config.block_shape, is_fp4_scale_swizzled=not self.use_dp, ) - if self.use_dp: - topk_weights, topk_ids, a1q, a1q_scale = get_dp_group().all_gatherv( - [topk_weights, topk_ids, a1q, a1q_scale], - dim=0, - sizes=get_local_sizes(), - ) - if quant_config.quant_dtype == "nvfp4": - a1q_scale = nvfp4_block_scale_interleave(a1q_scale) + topk_weights, topk_ids, a1q, a1q_scale = get_dp_group().all_gatherv( + [topk_weights, topk_ids, a1q, a1q_scale], + dim=0, + sizes=get_local_sizes(), + ) + if quant_config.quant_dtype == "nvfp4": + a1q_scale = nvfp4_block_scale_interleave(a1q_scale) return a1q, a1q_scale, None, topk_ids, topk_weights diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 0eeeaa3ce457f..37b682984fc35 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1769,29 +1769,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input, ) - elif ( - self.allow_flashinfer - and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS - ): - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - flashinfer_cutlass_moe_fp4, - ) - - assert self.moe_quant_config is not None - - return flashinfer_cutlass_moe_fp4( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - quant_config=self.moe_quant_config, - inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) else: # If no modular kernel is provided, use cutlass_moe_fp4 for TP case # only (no EP). 
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index b3a4cb2de1395..fdf330329e20c 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -79,6 +79,7 @@ def select_nvfp4_gemm_impl( ep_size=moe.moe_parallel_config.ep_size, tp_rank=moe.moe_parallel_config.tp_rank, tp_size=moe.moe_parallel_config.tp_size, + use_dp=moe.moe_parallel_config.dp_size > 1, ) # native cutlass experts currently don't support DP; TP case won't call this From 9e5bd3076e0b2dc9336ac230428424351426e2ef Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 31 Oct 2025 10:57:45 -0700 Subject: [PATCH 056/976] [Cleanup] Remove no-longer-used `SpeculativeConfig.enable_chunked_prefill` (#27826) Signed-off-by: Nick Hill --- vllm/config/speculative.py | 10 ---------- vllm/engine/arg_utils.py | 6 ------ vllm/entrypoints/openai/api_server.py | 1 + 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 903b9a26fab88..1f956526dcdc6 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -78,10 +78,6 @@ class SpeculativeConfig: draft_tensor_parallel_size: int | None = Field(default=None, ge=1) """The degree of the tensor parallelism for the draft model. Can only be 1 or the same as the target model's tensor parallel size.""" - disable_logprobs: bool = True - """If set to True, token log probabilities are not returned during - speculative decoding. If set to False, token log probabilities are returned - according to the log probability settings in SamplingParams.""" # Draft model configuration quantization: me_quant.QuantizationMethods | None = None @@ -126,12 +122,6 @@ class SpeculativeConfig: """The configuration of the target model.""" target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore """The parallel configuration for the target model.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore - """Whether vLLM is configured to use chunked prefill or not. Used for - raising an error since it's not yet compatible with speculative decode.""" - disable_log_stats: SkipValidation[bool] = None # type: ignore - """Whether to disable the periodic printing of stage times in speculative - decoding.""" # params generated in the post-init stage draft_model_config: SkipValidation[ModelConfig] = None # type: ignore diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b31e4931f2295..4e2c389bf84d3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1246,8 +1246,6 @@ class EngineArgs: self, target_model_config: ModelConfig, target_parallel_config: ParallelConfig, - enable_chunked_prefill: bool, - disable_log_stats: bool, ) -> SpeculativeConfig | None: """Initializes and returns a SpeculativeConfig object based on `speculative_config`. 
@@ -1267,8 +1265,6 @@ class EngineArgs: { "target_model_config": target_model_config, "target_parallel_config": target_parallel_config, - "enable_chunked_prefill": enable_chunked_prefill, - "disable_log_stats": disable_log_stats, } ) return SpeculativeConfig(**self.speculative_config) @@ -1561,8 +1557,6 @@ class EngineArgs: speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, - enable_chunked_prefill=self.enable_chunked_prefill, - disable_log_stats=self.disable_log_stats, ) # make sure num_lookahead_slots is set appropriately depending on diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f3aa5351e5302..8fa71855f8f66 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -241,6 +241,7 @@ async def build_async_engine_client_from_engine_args( ) # Don't keep the dummy data in memory + assert async_llm is not None await async_llm.reset_mm_cache() yield async_llm From 5e8862e9e0f5c81d81a1ee46248cc281edc42596 Mon Sep 17 00:00:00 2001 From: Vinay R Damodaran Date: Fri, 31 Oct 2025 11:05:50 -0700 Subject: [PATCH 057/976] [Feature] Pydantic validation for scheduler.py and structured_outputs.py (#26519) Signed-off-by: Vinay Damodaran Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/scheduler.py | 62 +++++++++---------- vllm/config/structured_outputs.py | 7 ++- vllm/engine/arg_utils.py | 2 +- .../tool_parsers/minimax_m2_tool_parser.py | 3 +- 4 files changed, 39 insertions(+), 35 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index af47531501cfb..b837b830e774b 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from dataclasses import InitVar, field +from collections.abc import Callable +from dataclasses import InitVar from typing import Any, Literal -from pydantic import SkipValidation, model_validator +from pydantic import Field, field_validator, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self @@ -31,28 +32,28 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: SkipValidation[int] = None # type: ignore + max_num_batched_tokens: int = Field(default=None, ge=1) """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: SkipValidation[int] = None # type: ignore + max_num_seqs: int = Field(default=None, ge=1) """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_model_len: SkipValidation[int] = None # type: ignore + max_model_len: int = Field(default=None, ge=1) """Maximum length of a sequence (including prompt and generated text). 
This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" - max_num_partial_prefills: int = 1 + max_num_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.""" - max_long_partial_prefills: int = 1 + max_long_partial_prefills: int = Field(default=1, ge=1) """For chunked prefill, the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. Setting this less than max_num_partial_prefills will allow shorter prompts to jump @@ -62,7 +63,7 @@ class SchedulerConfig: """For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.""" - num_lookahead_slots: int = 0 + num_lookahead_slots: int = Field(default=0, ge=0) """The number of slots to allocate per sequence per step, beyond the known token ids. This is used in speculative decoding to store KV activations of tokens which may or may not be @@ -71,7 +72,7 @@ class SchedulerConfig: NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore + enable_chunked_prefill: bool = Field(default=None) """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -86,14 +87,14 @@ class SchedulerConfig: """ # TODO (ywang96): Make this configurable. - max_num_encoder_input_tokens: int = field(init=False) + max_num_encoder_input_tokens: int = Field(init=False) """Multimodal encoder compute budget, only used in V1. NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.""" # TODO (ywang96): Make this configurable. - encoder_cache_size: int = field(init=False) + encoder_cache_size: int = Field(init=False) """Multimodal encoder cache size, only used in V1. NOTE: This is not currently configurable. It will be overridden by @@ -106,7 +107,7 @@ class SchedulerConfig: - "priority" means requests are handled based on given priority (lower value means earlier handling) and time of arrival deciding any ties).""" - chunked_prefill_enabled: bool = field(init=False) + chunked_prefill_enabled: bool = Field(init=False) """True if chunked prefill is enabled.""" disable_chunked_mm_input: bool = False @@ -155,6 +156,20 @@ class SchedulerConfig: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str + @field_validator( + "max_num_batched_tokens", + "max_num_seqs", + "max_model_len", + "enable_chunked_prefill", + mode="wrap", + ) + @classmethod + def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: + """Skip validation if the value is `None` when initialisation is delayed.""" + if value is None: + return value + return handler(value) + def __post_init__(self, is_encoder_decoder: bool) -> None: if self.max_model_len is None: self.max_model_len = 8192 @@ -260,19 +275,7 @@ class SchedulerConfig: self.max_num_seqs * self.max_model_len, ) - if self.num_lookahead_slots < 0: - raise ValueError( - "num_lookahead_slots " - f"({self.num_lookahead_slots}) must be greater than or " - "equal to 0." - ) - - if self.max_num_partial_prefills < 1: - raise ValueError( - f"max_num_partial_prefills ({self.max_num_partial_prefills}) " - "must be greater than or equal to 1." 
- ) - elif self.max_num_partial_prefills > 1: + if self.max_num_partial_prefills > 1: if not self.chunked_prefill_enabled: raise ValueError( "Chunked prefill must be enabled to set " @@ -286,13 +289,10 @@ class SchedulerConfig: f"than the max_model_len ({self.max_model_len})." ) - if (self.max_long_partial_prefills < 1) or ( - self.max_long_partial_prefills > self.max_num_partial_prefills - ): + if self.max_long_partial_prefills > self.max_num_partial_prefills: raise ValueError( - f"max_long_partial_prefills ({self.max_long_partial_prefills}) " - "must be greater than or equal to 1 and less than or equal to " - f"max_num_partial_prefills ({self.max_num_partial_prefills})." + f"{self.max_long_partial_prefills=} must be less than or equal to " + f"{self.max_num_partial_prefills=}." ) return self diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py index 76b565006e286..85b6e42264a42 100644 --- a/vllm/config/structured_outputs.py +++ b/vllm/config/structured_outputs.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from typing import Any, Literal +from typing import Any, Literal, Self +from pydantic import model_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -56,7 +57,8 @@ class StructuredOutputsConfig: hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str - def __post_init__(self): + @model_validator(mode="after") + def _validate_structured_output_config(self) -> Self: if self.disable_any_whitespace and self.backend not in ("xgrammar", "guidance"): raise ValueError( "disable_any_whitespace is only supported for " @@ -67,3 +69,4 @@ class StructuredOutputsConfig: "disable_additional_properties is only supported " "for the guidance backend." 
) + return self diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4e2c389bf84d3..b6f922a95519b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1807,7 +1807,7 @@ class EngineArgs: incremental_prefill_supported = ( pooling_type is not None and pooling_type.lower() == "last" - and is_causal + and bool(is_causal) ) action = "Enabling" if incremental_prefill_supported else "Disabling" diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py index 06dd336bf9cf3..d083ece892d50 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -import re import uuid from collections.abc import Sequence from typing import Any +import regex as re + from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall, From f29aeb5a25dad044306684e205adc159949c6ccb Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 31 Oct 2025 14:12:19 -0400 Subject: [PATCH 058/976] Add FLASHINFER_MLA to test_mla_backends and add B200 CI run (#27663) Signed-off-by: Matthew Bonanni --- .buildkite/test-pipeline.yaml | 10 + tests/v1/attention/test_mla_backends.py | 244 +++++++++++++----- tests/v1/attention/utils.py | 12 +- .../attention/backends/mla/flashinfer_mla.py | 6 +- 4 files changed, 208 insertions(+), 64 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3bd5bd87fe6f0..a020b0d276be0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -340,6 +340,16 @@ steps: commands: - pytest -v -s v1/attention +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - export VLLM_DISABLE_FLASHINFER_PREFILL=1 # TODO: FI prefill is bugged and causes incorrectness, fix this + - pytest -v -s v1/attention + - label: V1 Test others (CPU) # 5 mins source_file_dependencies: - vllm/ diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 1b17532884841..cda4fb11c096e 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -14,16 +14,19 @@ import torch from tests.v1.attention.utils import ( BatchSpec, create_common_attn_metadata, - create_standard_kv_cache_spec, create_vllm_config, try_get_attention_backend, ) from vllm import _custom_ops as ops -from vllm.attention.backends.registry import _Backend +from vllm.attention.backends.registry import _Backend, backend_to_class_str from vllm.attention.ops.flashmla import is_flashmla_dense_supported +from vllm.attention.utils.fa_utils import flash_attn_supports_mla from vllm.config.vllm import set_current_vllm_config +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.v1.attention.backends.mla.common import QueryLenSupport from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec @@ -31,17 +34,46 @@ BACKENDS_TO_TEST = [ _Backend.CUTLASS_MLA, _Backend.FLASHMLA, _Backend.FLASH_ATTN_MLA, + _Backend.FLASHINFER_MLA, _Backend.TRITON_MLA, ] -# 
Remove CUTLASS_MLA from the list if not using sm100 +# Remove sm100 backends from the list if not using sm100 if not torch.cuda.is_available() or torch.cuda.get_device_properties(0).major < 10: BACKENDS_TO_TEST.remove(_Backend.CUTLASS_MLA) + BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_MLA) + +# Remove FLASH_ATTN_MLA from the list if not supported +if not flash_attn_supports_mla(): + BACKENDS_TO_TEST.remove(_Backend.FLASH_ATTN_MLA) # Remove FLASHMLA from the list if not supported if not is_flashmla_dense_supported()[0]: BACKENDS_TO_TEST.remove(_Backend.FLASHMLA) +SPEC_DECODE_BACKENDS = [] +for backend in BACKENDS_TO_TEST: + builder_cls, _ = try_get_attention_backend(backend) + query_len_support = getattr( + builder_cls, "query_len_support", QueryLenSupport.SINGLE_ONLY + ) + if query_len_support != QueryLenSupport.SINGLE_ONLY: + SPEC_DECODE_BACKENDS.append(backend) + +BACKEND_BLOCK_SIZES = {} +for backend in BACKENDS_TO_TEST: + backend_class_str = backend_to_class_str(backend) + backend_class = resolve_obj_by_qualname(backend_class_str) + supported_sizes = backend_class.get_supported_kernel_block_size() + if supported_sizes: + default_size = supported_sizes[0] + block_size = ( + default_size if isinstance(default_size, int) else default_size.base + ) + else: + block_size = 16 + BACKEND_BLOCK_SIZES[backend] = block_size + torch.manual_seed(42) @@ -236,6 +268,26 @@ class MockAttentionLayer: self._q_scale = torch.tensor(1.0, device=device) self._k_scale = torch.tensor(1.0, device=device) self._v_scale = torch.tensor(1.0, device=device) + self._prob_scale = torch.tensor(1.0, device=device) + self._q_scale_float = 1.0 + self._k_scale_float = 1.0 + self._v_scale_float = 1.0 + + def forward(self, *_args, **_kwargs): + raise NotImplementedError + + +class MockMLAAttentionLayer(AttentionLayerBase): + """A mock MLA attention layer for populating static_forward_context.""" + + def __init__(self, impl): + self.impl = impl + + def get_attn_backend(self): + raise NotImplementedError + + def get_kv_cache_spec(self, vllm_config): + raise NotImplementedError def run_attention_backend( @@ -262,13 +314,6 @@ def run_attention_backend( # Set the current vllm config so that get_current_vllm_config() works # in the backend implementations with set_current_vllm_config(vllm_config): - # Build metadata - builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) - attn_metadata = builder.build( - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) - # Instantiate MLA implementation num_heads = vllm_config.model_config.get_num_attention_heads( vllm_config.parallel_config @@ -302,6 +347,19 @@ def run_attention_backend( act_dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) impl.process_weights_after_loading(act_dtype) + # Populate static_forward_context with mock attention layers + for layer_name in layer_names: + vllm_config.compilation_config.static_forward_context[layer_name] = ( + MockMLAAttentionLayer(impl) + ) + + # Build metadata + builder = builder_cls(kv_cache_spec, layer_names, vllm_config, device) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + # Create mock layer and output buffer mock_layer = MockAttentionLayer(device) num_tokens = query.shape[0] @@ -353,15 +411,14 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. 
""" - from vllm.v1.attention.backends.mla.common import QueryLenSupport batch_spec = BATCH_SPECS[batch_spec_name] is_spec_decode_test = batch_spec_name.startswith("spec_decode") - spec_decode_backends = {_Backend.FLASH_ATTN_MLA, _Backend.FLASHMLA} - - block_size = 16 + unique_block_sizes = sorted(set(BACKEND_BLOCK_SIZES.values())) + default_block_size = unique_block_sizes[0] required_blocks = sum( - (seq_len + block_size - 1) // block_size for seq_len in batch_spec.seq_lens + (seq_len + default_block_size - 1) // default_block_size + for seq_len in batch_spec.seq_lens ) # Add 1 for null block at index 0, and some buffer num_gpu_blocks = required_blocks + 1 + 100 @@ -370,7 +427,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): model_name=model, max_model_len=max(batch_spec.seq_lens), num_gpu_blocks=num_gpu_blocks, - block_size=block_size, + block_size=default_block_size, ) # For spec decode tests, add a speculative_config to set the reorder_batch_threshold @@ -388,8 +445,6 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): device = torch.device("cuda:0") - kv_cache_spec = create_standard_kv_cache_spec(vllm_config) - # 1. Setup batch_size = batch_spec.batch_size seq_lens = batch_spec.seq_lens @@ -399,7 +454,6 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): ) head_size = vllm_config.model_config.get_head_size() dtype = _convert_dtype_to_torch(vllm_config.model_config.dtype) - block_size = vllm_config.cache_config.block_size kv_lora_rank = 512 qk_rope_head_dim = 64 qk_nope_head_dim = 128 @@ -598,33 +652,83 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): ) mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T, requires_grad=False) - # Create metadata using original batch spec - common_attn_metadata = create_common_attn_metadata( - batch_spec, vllm_config.cache_config.block_size, device - ) + # 3. Create metadata and KV caches for each block size + # Group backends by block size and test each group + metadata_per_block_size = {} + kv_cache_per_block_size = {} - # 3. 
Simulate Paged KV Cache and a realistic slot_mapping - kv_cache = create_and_prepopulate_kv_cache( - kv_c_contexts=kv_c_contexts, - k_pe_contexts=k_pe_contexts, - block_size=block_size, - head_size=head_size, - dtype=dtype, - device=device, - num_blocks=vllm_config.cache_config.num_gpu_blocks, - common_attn_metadata=common_attn_metadata, - randomize_blocks=True, - ) + for block_size in unique_block_sizes: + # Create metadata for this block size + common_attn_metadata = create_common_attn_metadata( + batch_spec, block_size, device + ) + + # Pad block table to meet requirement: + # block_num % (128 / block_size) == 0 + required_divisor = int(128 / block_size) + current_block_num = common_attn_metadata.block_table_tensor.shape[1] + if current_block_num % required_divisor != 0: + # Pad to next multiple of required_divisor + padded_block_num = ( + (current_block_num + required_divisor - 1) // required_divisor + ) * required_divisor + padding_cols = padded_block_num - current_block_num + padding = torch.zeros( + (common_attn_metadata.block_table_tensor.shape[0], padding_cols), + dtype=torch.int32, + device=device, + ) + common_attn_metadata.block_table_tensor = torch.cat( + [common_attn_metadata.block_table_tensor, padding], dim=1 + ) + + metadata_per_block_size[block_size] = common_attn_metadata + + # Create KV cache for this block size + required_blocks_for_size = sum( + (seq_len + block_size - 1) // block_size for seq_len in batch_spec.seq_lens + ) + num_blocks_for_size = required_blocks_for_size + 1 + 100 + + kv_cache = create_and_prepopulate_kv_cache( + kv_c_contexts=kv_c_contexts, + k_pe_contexts=k_pe_contexts, + block_size=block_size, + head_size=head_size, + dtype=dtype, + device=device, + num_blocks=num_blocks_for_size, + common_attn_metadata=common_attn_metadata, + randomize_blocks=True, + ) + kv_cache_per_block_size[block_size] = kv_cache # 4. 
Run vLLM backends and compare + failures = [] for backend_idx, backend_name in enumerate(BACKENDS_TO_TEST): # Skip backends that don't support spec decode for spec decode tests - if is_spec_decode_test and backend_name not in spec_decode_backends: + if is_spec_decode_test and backend_name not in SPEC_DECODE_BACKENDS: continue + # Get the appropriate block_size, metadata, and cache for this backend + block_size = BACKEND_BLOCK_SIZES[backend_name] + common_attn_metadata = metadata_per_block_size[block_size] + kv_cache = kv_cache_per_block_size[block_size] + + # Create kv_cache_spec with the correct block_size for this backend + backend_kv_cache_spec = FullAttentionSpec( + block_size=block_size, + num_kv_heads=vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ), + head_size=vllm_config.model_config.get_head_size(), + dtype=vllm_config.model_config.dtype, + sliding_window=vllm_config.model_config.get_sliding_window(), + ) + backend_output = run_attention_backend( backend_name, - kv_cache_spec, + backend_kv_cache_spec, ["placeholder"], vllm_config, device, @@ -644,32 +748,48 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): expected_output = sdpa_outputs[backend_name] # Check shape and dtype consistency - assert backend_output.shape == expected_output.shape, ( - f"[{backend_name}] shape {backend_output.shape} != " - f"SDPA shape {expected_output.shape}" - ) - assert backend_output.dtype == expected_output.dtype, ( - f"[{backend_name}] dtype {backend_output.dtype} != " - f"SDPA dtype {expected_output.dtype}" - ) + try: + assert backend_output.shape == expected_output.shape, ( + f"[{backend_name}] shape {backend_output.shape} != " + f"SDPA shape {expected_output.shape}" + ) + assert backend_output.dtype == expected_output.dtype, ( + f"[{backend_name}] dtype {backend_output.dtype} != " + f"SDPA dtype {expected_output.dtype}" + ) - assert torch.isfinite(backend_output).all(), ( - f"[{backend_name}] produced non-finite values" - ) + assert torch.isfinite(backend_output).all(), ( + f"[{backend_name}] produced non-finite values" + ) - # Check numerical similarity - rtol = 1e-2 - atol = 5e-1 + # Check numerical similarity + rtol = 1e-2 + atol = 5e-1 - max_diff = torch.max(torch.abs(backend_output - expected_output)).item() - max_rel_diff = torch.max( - torch.abs(backend_output - expected_output) / torch.abs(expected_output) - ).item() - all_close = torch.allclose( - backend_output, expected_output, rtol=rtol, atol=atol - ) + max_diff = torch.max(torch.abs(backend_output - expected_output)).item() + max_rel_diff = torch.max( + torch.abs(backend_output - expected_output) / torch.abs(expected_output) + ).item() + all_close = torch.allclose( + backend_output, expected_output, rtol=rtol, atol=atol + ) - assert all_close, ( - f"[{backend_name}] output differs from SDPA baseline. " - f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})" - ) + assert all_close, ( + f"[{backend_name}] output differs from SDPA baseline. " + f"Max diff: {max_diff:.6f}, max rel diff: {max_rel_diff:.6f})" + ) + except AssertionError as e: + failures.append(str(e)) + + # Report all failures at once + if failures: + # Create a summary for the single-line failure message + backend_names = [] + for f in failures: + if "[_Backend." 
in f: + backend_name = f.split("[")[1].split("]")[0] + backend_names.append(backend_name) + + summary = f"{len(failures)} backend(s) failed: {', '.join(backend_names)}" + detailed_msg = "\n".join(failures) + pytest.fail(f"{summary}\n{detailed_msg}") diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 15ed7bdc835bb..b166d9d4ff688 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -285,7 +285,17 @@ full_cg_backend_configs = { name="CutlassMLA", env_vars={ "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA", - "FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed + }, + comp_config={ + "cudagraph_mode": "FULL_AND_PIECEWISE", + }, + specific_gpu_arch=(10, 0), + ), + # FlashInfer MLA on Blackwell + "FlashInferMLA": BackendConfig( + name="FlashInferMLA", + env_vars={ + "VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA", }, comp_config={ "cudagraph_mode": "FULL_AND_PIECEWISE", diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 44807c39cad30..ebbcfd0eaa2fb 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -6,7 +6,7 @@ from typing import ClassVar import torch from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla -from vllm.attention.backends.abstract import AttentionLayer, AttentionType +from vllm.attention.backends.abstract import AttentionLayer, AttentionType, MultipleOf from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, @@ -40,6 +40,10 @@ class FlashInferMLABackend(MLACommonBackend): def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]: return FlashInferMLAMetadataBuilder + @classmethod + def get_supported_kernel_block_size(cls) -> list[int | MultipleOf]: + return [32, 64] + g_fi_workspace = torch.zeros( FLASHINFER_MLA_WORKSPACE_BUFFER_SIZE, From 0e0a638c3b1e239ec4eaee5b4c15808768689eb0 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Fri, 31 Oct 2025 17:22:19 -0400 Subject: [PATCH 059/976] Batch invariance doc (#27839) Signed-off-by: Bram Wasti Signed-off-by: Bram Wasti Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/features/batch_invariance.md | 133 ++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 docs/features/batch_invariance.md diff --git a/docs/features/batch_invariance.md b/docs/features/batch_invariance.md new file mode 100644 index 0000000000000..b196db9d9c25c --- /dev/null +++ b/docs/features/batch_invariance.md @@ -0,0 +1,133 @@ +# Batch Invariance + +!!! note + Batch invariance is currently in beta. Some features are still under active development. + Track progress and planned improvements at + +This document shows how to enable batch invariance in vLLM. Batch invariance ensures that the output of a model is deterministic and independent of the batch size or the order of requests in a batch. + +## Motivation + +Batch invariance is crucial for several use cases: + +- **Framework debugging**: Deterministic outputs make it easier to debug issues in the inference framework, as the same input will always produce the same output regardless of batching. +- **Model debugging**: Helps identify issues in model implementations by ensuring consistent behavior across different batch configurations. 
+- **Reinforcement Learning (RL)**: RL training often requires deterministic rollouts for reproducibility and stable training. +- **Large-scale inference systems**: Systems that use vLLM as a component benefit from deterministic behavior for testing, validation, and consistency guarantees. + +## Hardware Requirements + +Batch invariance currently requires NVIDIA GPUs with compute capability 9.0 or higher: + +- **H-series**: H100, H200 +- **B-series**: B100, B200 + +## Enabling Batch Invariance + +Batch invariance can be enabled by setting the `VLLM_BATCH_INVARIANT` environment variable to `1`: + +```bash +export VLLM_BATCH_INVARIANT=1 +``` + +### Online Inference (Server Mode) + +To start a vLLM server with batch invariance enabled: + +```bash +VLLM_BATCH_INVARIANT=1 vllm serve meta-llama/Llama-3.1-8B-Instruct +``` + +Then use the OpenAI-compatible client: + +```python +from openai import OpenAI + +client = OpenAI( + api_key="EMPTY", + base_url="http://localhost:8000/v1", +) + +# These requests will produce deterministic outputs +# regardless of batch size or order +response = client.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + prompt="The future of AI is", + max_tokens=100, + temperature=0.7, + seed=42, +) + +print(response.choices[0].text) +``` + +### Offline Inference + +For offline batch inference with batch invariance: + +```python +import os +os.environ["VLLM_BATCH_INVARIANT"] = "1" + +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", + "Machine learning enables", + "Deep learning models can", +] + +sampling_params = SamplingParams( + temperature=0.7, + top_p=0.95, + max_tokens=100, + seed=42, +) + +llm = LLM( + model="meta-llama/Llama-3.1-8B-Instruct", + tensor_parallel_size=1, +) + +# Outputs will be deterministic regardless of batch size +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Generated: {generated_text!r}\n") +``` + +## Tested Models + +Batch invariance has been tested and verified on the following models: + +- **DeepSeek series**: `deepseek-ai/DeepSeek-V3`, `deepseek-ai/DeepSeek-V3-0324`, `deepseek-ai/DeepSeek-R1`, `deepseek-ai/DeepSeek-V3.1` +- **Qwen3 (Dense)**: `Qwen/Qwen3-1.7B`, `Qwen/Qwen3-8B` +- **Qwen3 (MoE)**: `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct` +- **Llama 3**: `meta-llama/Llama-3.1-8B-Instruct`, `meta-llama/Llama-3.2-1B-Instruct` + +Other models may also work, but these have been explicitly validated. If you encounter issues with a specific model, please report them on the [GitHub issue tracker](https://github.com/vllm-project/vllm/issues/new/choose). + +## Implementation Details + +When batch invariance is enabled, vLLM: + +1. Uses deterministic kernel implementations for attention and other operations +2. Ensures consistent numerical behavior across different batch sizes +3. Disables certain optimizations that may introduce non-determinism (such as custom all-reduce operations in tensor parallel mode) + +!!! note + Enabling batch invariance may impact performance compared to the default non-deterministic mode. This trade-off is intentional to guarantee reproducibility. + +## Future Improvements + +The batch invariance feature is under active development. 
Planned improvements include: + +- Support for additional GPU architectures +- Expanded model coverage +- Performance optimizations +- Additional testing and validation + +For the latest status and to contribute ideas, see the [tracking issue](https://github.com/vllm-project/vllm/issues/27433). From df334868ca5b7d8785121f8eaf52b1526e7766ac Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 31 Oct 2025 14:30:28 -0700 Subject: [PATCH 060/976] [Hybrid] A simpler algorithm to find kernel_block_size (#26476) Signed-off-by: Chen Zhang --- tests/v1/worker/test_gpu_model_runner.py | 53 +++++++ vllm/v1/worker/gpu_model_runner.py | 169 ++++++++++++----------- vllm/v1/worker/utils.py | 6 +- 3 files changed, 146 insertions(+), 82 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 9007436350be4..23ab70480fbb3 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -6,6 +6,7 @@ import pytest import torch from vllm.attention import Attention +from vllm.attention.backends.abstract import MultipleOf from vllm.config import ( CacheConfig, ModelConfig, @@ -34,6 +35,7 @@ from vllm.v1.kv_cache_interface import ( from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner +from vllm.v1.worker.utils import AttentionGroup BLOCK_SIZE = 16 NUM_BLOCKS = 10 @@ -181,6 +183,57 @@ def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: ).all() +def _make_mock_backend_for_kernel_block_size( + supported_sizes: list[int | MultipleOf], +): + class _MockBackend: + @staticmethod + def get_supported_kernel_block_size(): + return supported_sizes + + return _MockBackend() + + +def _make_kv_cache_spec() -> FullAttentionSpec: + return FullAttentionSpec(block_size=1, num_kv_heads=1, head_size=1, dtype="float16") + + +def test_select_common_block_size_prefers_manager_block_size(): + backend_a = _make_mock_backend_for_kernel_block_size([MultipleOf(32)]) + backend_b = _make_mock_backend_for_kernel_block_size([64, MultipleOf(16)]) + attn_groups = [ + AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0), + AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0), + ] + + selected_size = GPUModelRunner.select_common_block_size(128, attn_groups) + assert selected_size == 128 + + +def test_select_common_block_size_uses_largest_shared_int(): + backend_a = _make_mock_backend_for_kernel_block_size([128, 64]) + backend_b = _make_mock_backend_for_kernel_block_size([64, 32]) + attn_groups = [ + AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0), + AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0), + ] + + selected_size = GPUModelRunner.select_common_block_size(256, attn_groups) + assert selected_size == 64 + + +def test_select_common_block_size_no_valid_option(): + backend_a = _make_mock_backend_for_kernel_block_size([64]) + backend_b = _make_mock_backend_for_kernel_block_size([MultipleOf(16)]) + attn_groups = [ + AttentionGroup(backend_a, [], [], _make_kv_cache_spec(), 0), + AttentionGroup(backend_b, [], [], _make_kv_cache_spec(), 0), + ] + + with pytest.raises(ValueError): + GPUModelRunner.select_common_block_size(48, attn_groups) + + def test_update_states_new_request(model_runner, dist_init): req_id = "req_0" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 747a7b377e401..ba852bb89f33d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ 
b/vllm/v1/worker/gpu_model_runner.py @@ -3978,6 +3978,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def create_attn_groups( attn_backends_map: dict[AttentionGroupKey, list[str]], + kv_cache_group_id: int, ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] for (attn_backend, kv_cache_spec), layer_names in attn_backends_map.items(): @@ -3987,6 +3988,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_spec, self.vllm_config, self.device, + kv_cache_group_id, num_metadata_builders=1 if not self.parallel_config.enable_dbo else 2, @@ -4005,8 +4007,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Resolve cudagraph_mode before actually initialize metadata_builders self._check_and_update_cudagraph_mode(attention_backend_set) - for attn_backends_map in attention_backend_maps: - self.attn_groups.append(create_attn_groups(attn_backends_map)) + for i, attn_backend_map in enumerate(attention_backend_maps): + self.attn_groups.append(create_attn_groups(attn_backend_map, i)) # Calculate reorder batch threshold (if needed) self.calculate_reorder_batch_threshold() @@ -4156,87 +4158,81 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) - def _find_compatible_block_sizes( - self, - kv_manager_block_size: int, - backend_cls: type[AttentionBackend], - return_all: bool = False, - ) -> list[int]: - """ - Find compatible block sizes for a backend. - - Args: - kv_manager_block_size: Physical block size of KV cache - backend_cls: Attention backend class - return_all: Return all compatible sizes if True, max size if False - - Returns: - Compatible block size(s) based on return_all parameter - - Raises: - ValueError: If no compatible block size found - """ - supported_block_size = backend_cls.get_supported_kernel_block_size() - compatible_sizes = [] - - for block_size in supported_block_size: - if isinstance(block_size, int): - if kv_manager_block_size % block_size == 0: - compatible_sizes.append(block_size) - elif ( - isinstance(block_size, MultipleOf) - and kv_manager_block_size % block_size.base == 0 - ): - compatible_sizes.append(kv_manager_block_size) - - if not compatible_sizes: - raise ValueError(f"No compatible block size for {kv_manager_block_size}") - - return compatible_sizes if return_all else [max(compatible_sizes)] - - def _select_common_block_size( - self, kv_manager_block_size: int, attn_groups: list[AttentionGroup] + @staticmethod + def select_common_block_size( + kv_manager_block_size: int, attn_groups: list[AttentionGroup] ) -> int: """ - Select common block size for all backends. + Select a block size that is supported by all backends and is a factor of + kv_manager_block_size. + + If kv_manager_block_size is supported by all backends, return it directly. + Otherwise, return the max supported size. 
Args: kv_manager_block_size: Block size of KV cache attn_groups: List of attention groups Returns: - Block size supported by all backends, - prioritizing cache_config.block_size + The selected block size Raises: - ValueError: If no common block size found + ValueError: If no valid block size found """ - all_backend_supports = [] - for attn_group in attn_groups: - compatible_sizes = self._find_compatible_block_sizes( - kv_manager_block_size, attn_group.backend, return_all=True - ) - supported_sizes = sorted(list(set(compatible_sizes)), reverse=True) - all_backend_supports.append(set(supported_sizes)) + def block_size_is_supported( + backends: list[type[AttentionBackend]], block_size: int + ) -> bool: + """ + Check if the block size is supported by all backends. + """ + for backend in backends: + is_supported = False + for supported_size in backend.get_supported_kernel_block_size(): + if isinstance(supported_size, int): + if block_size == supported_size: + is_supported = True + elif isinstance(supported_size, MultipleOf): + if block_size % supported_size.base == 0: + is_supported = True + else: + raise ValueError(f"Unknown supported size: {supported_size}") + if not is_supported: + return False + return True - common_supported_sizes = set.intersection(*all_backend_supports) + backends = [group.backend for group in attn_groups] - if not common_supported_sizes: - error_msg = f"No common block size for {kv_manager_block_size}. " - for i, attn_group in enumerate(attn_groups): - supported = all_backend_supports[i] - error_msg += ( - f"Backend {attn_group.backend} supports: {sorted(supported)}. " - ) - raise ValueError(error_msg) + # Case 1: if the block_size of kv cache manager is supported by all backends, + # return it directly + if block_size_is_supported(backends, kv_manager_block_size): + return kv_manager_block_size - if self.cache_config.block_size in common_supported_sizes: - return self.cache_config.block_size + # Case 2: otherwise, the block_size must be an `int`-format supported size of + # at least one backend. Iterate over all `int`-format supported sizes in + # descending order and return the first one that is supported by all backends. + # Simple proof: + # If the supported size b is in MultipleOf(x_i) format for all attention + # backends i, and b a factor of kv_manager_block_size, then + # kv_manager_block_size also satisfies MultipleOf(x_i) for all i. We will + # return kv_manager_block_size in case 1. + all_int_supported_sizes = set( + supported_size + for backend in backends + for supported_size in backend.get_supported_kernel_block_size() + if isinstance(supported_size, int) + ) - return max(common_supported_sizes) + for supported_size in sorted(all_int_supported_sizes, reverse=True): + if kv_manager_block_size % supported_size != 0: + continue + if block_size_is_supported(backends, supported_size): + return supported_size + raise ValueError(f"No common block size for {kv_manager_block_size}. ") - def may_reinitialize_input_batch(self, kv_cache_config: KVCacheConfig) -> None: + def may_reinitialize_input_batch( + self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int] + ) -> None: """ Re-initialize the input batch if the block sizes are different from `[self.cache_config.block_size]`. This usually happens when there @@ -4244,6 +4240,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): Args: kv_cache_config: The KV cache configuration. + kernel_block_sizes: The kernel block sizes for each KV cache group. 
""" block_sizes = [ kv_cache_group.kv_cache_spec.block_size @@ -4251,9 +4248,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if not isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec) ] - # Generate kernel_block_sizes that matches each block_size - kernel_block_sizes = self._prepare_kernel_block_sizes(kv_cache_config) - if block_sizes != [self.cache_config.block_size] or kernel_block_sizes != [ self.cache_config.block_size ]: @@ -4354,7 +4348,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # all backends in the group. attn_groups = self.attn_groups[kv_cache_group_id] kv_manager_block_size = kv_cache_group.kv_cache_spec.block_size - selected_kernel_size = self._select_common_block_size( + selected_kernel_size = self.select_common_block_size( kv_manager_block_size, attn_groups ) kernel_block_sizes.append(selected_kernel_size) @@ -4372,6 +4366,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self, kv_cache_config: KVCacheConfig, kv_cache_raw_tensors: dict[str, torch.Tensor], + kernel_block_sizes: list[int], ) -> dict[str, torch.Tensor]: """ Reshape the KV cache tensors to the desired shape and dtype. @@ -4380,6 +4375,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_config: The KV cache config kv_cache_raw_tensors: The KV cache buffer of each layer, with correct size but uninitialized shape. + kernel_block_sizes: The kernel block sizes for each KV cache group. Returns: Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. @@ -4389,6 +4385,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): for group in self._kv_cache_spec_attn_group_iterator(): kv_cache_spec = group.kv_cache_spec attn_backend = group.backend + if group.kv_cache_group_id == len(kernel_block_sizes): + # There may be a last group for layers without kv cache. 
+ continue + kernel_block_size = kernel_block_sizes[group.kv_cache_group_id] for layer_name in group.layer_names: if layer_name in self.runner_only_attn_layers: continue @@ -4397,24 +4397,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_blocks = raw_tensor.numel() // kv_cache_spec.page_size_bytes if isinstance(kv_cache_spec, AttentionSpec): has_attn = True - kv_manager_block_size = kv_cache_spec.block_size - kernel_size_list = self._find_compatible_block_sizes( - kv_manager_block_size, attn_backend, return_all=False + num_blocks_per_kv_block = ( + kv_cache_spec.block_size // kernel_block_size ) - kernel_size = kernel_size_list[0] - num_blocks_per_kv_block = kv_manager_block_size // kernel_size kernel_num_blocks = num_blocks * num_blocks_per_kv_block kv_cache_shape = attn_backend.get_kv_cache_shape( kernel_num_blocks, - kernel_size, + kernel_block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size, cache_dtype_str=self.cache_config.cache_dtype, ) dtype = kv_cache_spec.dtype try: - kv_cache_stride_order = attn_backend.get_kv_cache_stride_order() # noqa: E501 + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order() assert len(kv_cache_stride_order) == len(kv_cache_shape) except (AttributeError, NotImplementedError): kv_cache_stride_order = tuple(range(len(kv_cache_shape))) @@ -4497,13 +4494,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) def initialize_kv_cache_tensors( - self, kv_cache_config: KVCacheConfig + self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int] ) -> dict[str, torch.Tensor]: """ Initialize the memory buffer for KV cache. Args: kv_cache_config: The KV cache config + kernel_block_sizes: The kernel block sizes for each KV cache group. + Returns: Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. @@ -4512,7 +4511,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) # Change the memory buffer to the desired shape kv_caches = self._reshape_kv_cache_tensors( - kv_cache_config, kv_cache_raw_tensors + kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes ) # Set up cross-layer KV cache sharing @@ -4571,9 +4570,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.may_add_encoder_only_layers_to_kv_cache_config() self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config) self.initialize_attn_backend(kv_cache_config) + # The kernel block size for all KV cache groups. For example, if + # kv_cache_manager uses block_size 256 for a given group, but the attention + # backends for that group only supports block_size 64, we will return + # kernel_block_size 64 and split the 256-token-block to 4 blocks with 64 + # tokens each. 
+ kernel_block_sizes = self._prepare_kernel_block_sizes(kv_cache_config) # Reinitialize need to after initialize_attn_backend - self.may_reinitialize_input_batch(kv_cache_config) - kv_caches = self.initialize_kv_cache_tensors(kv_cache_config) + self.may_reinitialize_input_batch(kv_cache_config, kernel_block_sizes) + kv_caches = self.initialize_kv_cache_tensors( + kv_cache_config, kernel_block_sizes + ) if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 92baf0cb71368..396adbcfb289f 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -140,6 +140,7 @@ class AttentionGroup: metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] kv_cache_spec: KVCacheSpec + kv_cache_group_id: int @staticmethod def create_with_metadata_builders( @@ -148,13 +149,16 @@ class AttentionGroup: kv_cache_spec: KVCacheSpec, vllm_config: VllmConfig, device: torch.device, + kv_cache_group_id: int, num_metadata_builders: int = 1, ) -> "AttentionGroup": metadata_builders = [ backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config, device) for _ in range(num_metadata_builders) ] - return AttentionGroup(backend, metadata_builders, layer_names, kv_cache_spec) + return AttentionGroup( + backend, metadata_builders, layer_names, kv_cache_spec, kv_cache_group_id + ) def get_metadata_builder(self, ubatch_id: int = 0) -> AttentionMetadataBuilder: assert len(self.metadata_builders) > ubatch_id From 0cdbe7b744b7d3a46dc2443cd16b5ed3465e6776 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 31 Oct 2025 17:35:04 -0700 Subject: [PATCH 061/976] [Core] Async scheduling + structured outputs compatibility (#26866) Signed-off-by: Nick Hill --- tests/conftest.py | 3 + tests/v1/core/test_scheduler.py | 9 --- ...nd_preempt.py => test_async_scheduling.py} | 14 +++- tests/v1/engine/test_engine_core.py | 19 ++++- tests/v1/executor/test_executor.py | 4 +- .../unit/test_kv_connector_lifecyle.py | 2 - .../kv_connector/unit/test_nixl_connector.py | 4 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 12 --- tests/v1/worker/test_gpu_model_runner.py | 12 --- .../kv_transfer/kv_connector/utils.py | 30 ++++--- vllm/v1/core/sched/async_scheduler.py | 8 ++ vllm/v1/core/sched/interface.py | 8 +- vllm/v1/core/sched/output.py | 17 ++-- vllm/v1/core/sched/scheduler.py | 31 +++----- vllm/v1/engine/core.py | 71 +++++++++++++---- vllm/v1/executor/abstract.py | 36 ++++++--- vllm/v1/executor/multiproc_executor.py | 43 ++++++---- vllm/v1/executor/ray_executor.py | 37 ++++++++- vllm/v1/executor/ray_utils.py | 35 ++++---- vllm/v1/structured_output/utils.py | 37 +++++---- vllm/v1/worker/gpu_model_runner.py | 79 ++++++++++++++++--- vllm/v1/worker/gpu_worker.py | 17 ++-- vllm/v1/worker/tpu_model_runner.py | 45 ++++++++--- vllm/v1/worker/tpu_worker.py | 13 ++- vllm/v1/worker/worker_base.py | 24 +++++- 25 files changed, 419 insertions(+), 191 deletions(-) rename tests/v1/e2e/{test_async_sched_and_preempt.py => test_async_scheduling.py} (91%) diff --git a/tests/conftest.py b/tests/conftest.py index 91155a72b16ca..41fda04a6c92d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,9 @@ from copy import deepcopy from tblib import pickling_support +# Import fixture +from tests.v1.entrypoints.conftest import sample_json_schema # noqa + # ruff: noqa # Install support for pickling exceptions so that we can nicely propagate diff --git a/tests/v1/core/test_scheduler.py 
b/tests/v1/core/test_scheduler.py index fba5772396829..92e3831b9c7a6 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -337,8 +337,6 @@ def test_stop_via_update_from_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_output = ModelRunnerOutput( @@ -385,8 +383,6 @@ def test_stop_via_update_from_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_output = ModelRunnerOutput( @@ -431,8 +427,6 @@ def test_stop_via_update_from_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_output = ModelRunnerOutput( @@ -472,8 +466,6 @@ def test_stop_via_update_from_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_output = ModelRunnerOutput( @@ -1988,7 +1980,6 @@ def test_schedule_skip_tokenizer_init(): scheduler.add_request(request) output = scheduler.schedule() assert len(output.scheduled_new_reqs) == len(requests) - assert output.grammar_bitmask is None def test_schedule_skip_tokenizer_init_structured_output_request(): diff --git a/tests/v1/e2e/test_async_sched_and_preempt.py b/tests/v1/e2e/test_async_scheduling.py similarity index 91% rename from tests/v1/e2e/test_async_sched_and_preempt.py rename to tests/v1/e2e/test_async_scheduling.py index 15a1cc2558177..444afd5196dd8 100644 --- a/tests/v1/e2e/test_async_sched_and_preempt.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -7,6 +7,7 @@ import torch._dynamo.config as dynamo_config from vllm import SamplingParams from vllm.logprobs import Logprob +from vllm.sampling_params import StructuredOutputsParams from ...conftest import VllmRunner from ...models.utils import check_outputs_equal @@ -15,9 +16,12 @@ MODEL = "Qwen/Qwen3-0.6B" @dynamo_config.patch(cache_size_limit=16) -def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch): +def test_preempt_and_async_scheduling_e2e( + sample_json_schema, monkeypatch: pytest.MonkeyPatch +): """Test consistency of combos of async scheduling, preemption, - uni/multiproc executor, and various sampling parameters.""" + uni/multiproc executor, and various sampling parameters + including structured outputs.""" first_prompt = ( "The following numbers of the sequence " @@ -35,6 +39,12 @@ def test_preempt_and_async_scheduling_e2e(monkeypatch: pytest.MonkeyPatch): dict(bad_words=["the", " the"]), dict(logprobs=2), dict(logprobs=2, presence_penalty=-1.0), + dict(structured_outputs=StructuredOutputsParams(json=sample_json_schema)), + dict( + structured_outputs=StructuredOutputsParams(json=sample_json_schema), + logprobs=2, + presence_penalty=-1.0, + ), ] default_params = dict( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index becedb59f644d..534b60312fd19 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -248,7 +248,7 @@ def test_engine_core_concurrent_batches(): self, scheduler_output, non_block=False, - ) -> Future[ModelRunnerOutput]: + ) -> Future[ModelRunnerOutput | None]: """Make execute_model non-blocking.""" # DummyExecutor used only for testing async case. 
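The pattern exercised by this DummyExecutor is reused below for sample_tokens: a blocking worker call is wrapped in a Future by submitting it to a small thread pool and deep-copying the result before the worker can reuse its output buffers. A self-contained sketch of the same idea, with illustrative names that are not vLLM APIs:

import copy
from concurrent.futures import Future, ThreadPoolExecutor


class ToyNonBlockingExecutor:
    """Wraps a blocking backend so scheduling can overlap with execution."""

    def __init__(self, backend) -> None:
        self.backend = backend                      # any object with a blocking .run(batch)
        self.pool = ThreadPoolExecutor(max_workers=1)

    def run(self, batch, non_block: bool = False) -> Future:
        def _execute():
            out = self.backend.run(batch)
            # Copy defensively: the backend may reuse its output buffers
            # for the next batch while the caller still holds this result.
            return copy.deepcopy(out)

        if not non_block:
            done: Future = Future()
            done.set_result(_execute())
            return done
        return self.pool.submit(_execute)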
@@ -263,6 +263,23 @@ def test_engine_core_concurrent_batches(): # Use the thread pool instead of creating a new thread return self.thread_pool.submit(_execute) + def sample_tokens( + self, grammar_output, non_block=False + ) -> Future[ModelRunnerOutput]: + """Make sample_tokens non-blocking.""" + + # DummyExecutor used only for testing async case. + assert non_block + + def _execute(): + output = self.collective_rpc("sample_tokens", args=(grammar_output,)) + # Make a copy because output[0] may be reused + # by the next batch. + return copy.deepcopy(output[0]) + + # Use the thread pool instead of creating a new thread + return self.thread_pool.submit(_execute) + @property def max_concurrent_batches(self) -> int: return 2 diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py index 7293ad09a7176..56574124b2727 100644 --- a/tests/v1/executor/test_executor.py +++ b/tests/v1/executor/test_executor.py @@ -31,7 +31,9 @@ class CustomMultiprocExecutor(MultiprocExecutor): # Drop marker to show that this was run with open(".marker", "w"): ... - return super().collective_rpc(method, timeout, args, kwargs) + return super().collective_rpc( + method, timeout, args, kwargs, non_block, unique_reply_rank + ) CustomMultiprocExecutorAsync = CustomMultiprocExecutor diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index b5c8f378be182..d0a6eeae6286d 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -26,8 +26,6 @@ def _make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, kv_connector_metadata=SharedStorageConnectorMetadata(), ) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 44d8b3e331fdb..1f3fdafc644d8 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -981,9 +981,7 @@ def test_scheduler_kv_connector_stats_aggregation(): scheduled_encoder_inputs={}, num_common_prefix_blocks=[0], finished_req_ids=set(), - free_encoder_mm_hashes=set(), - structured_output_request_ids={}, - grammar_bitmask=None, + free_encoder_mm_hashes=[], ) engine_core_outputs = scheduler.update_from_output(scheduler_output, model_output) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 18aa599f1aaf7..7b3a07b4e12a5 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -92,8 +92,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) @@ -171,8 +169,6 @@ def test_update_states_request_finished(model_runner): num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_runner._update_states(scheduler_output) @@ -201,8 +197,6 @@ def test_update_states_request_resumed(model_runner): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_runner._update_states(scheduler_output) @@ -230,8 +224,6 @@ def test_update_states_request_resumed(model_runner): 
num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_runner._update_states(scheduler_output) @@ -261,8 +253,6 @@ def test_update_states_no_changes(model_runner): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_runner._update_states(scheduler_output) @@ -296,8 +286,6 @@ def test_update_states_request_unscheduled(model_runner): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_runner._update_states(scheduler_output) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 23ab70480fbb3..db0215511d322 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -152,8 +152,6 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) @@ -269,8 +267,6 @@ def test_update_states_request_finished(model_runner, dist_init): num_common_prefix_blocks=[], finished_req_ids={req_id}, free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) metadata_before = model_runner.input_batch.sampling_metadata @@ -301,8 +297,6 @@ def test_update_states_request_resumed(model_runner, dist_init): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) model_runner._update_states(scheduler_output) @@ -330,8 +324,6 @@ def test_update_states_request_resumed(model_runner, dist_init): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) metadata_before = model_runner.input_batch.sampling_metadata @@ -423,8 +415,6 @@ def test_update_states_no_changes(model_runner, dist_init): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) metadata_before = model_runner.input_batch.sampling_metadata @@ -460,8 +450,6 @@ def test_update_states_request_unscheduled(model_runner, dist_init): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - structured_output_request_ids=[], - grammar_bitmask=None, ) metadata_before = model_runner._update_states(scheduler_output) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 22af489a89b99..7464f8469c3b5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -6,7 +6,7 @@ KV cache helper for store. 
from collections.abc import Sequence from concurrent.futures import CancelledError, Future -from typing import TYPE_CHECKING, Literal, cast +from typing import TYPE_CHECKING, Literal import torch @@ -138,8 +138,11 @@ class KVOutputAggregator: return cls(connector.get_finished_count() or world_size) def aggregate( - self, outputs: list[ModelRunnerOutput], output_rank: int = 0 - ) -> ModelRunnerOutput: + self, outputs: list[ModelRunnerOutput | None], output_rank: int = 0 + ) -> ModelRunnerOutput | None: + if not outputs[output_rank]: + return None + # Aggregate kv_connector_output from all workers def update_finished_set( @@ -161,6 +164,7 @@ class KVOutputAggregator: aggregated_kv_connector_stats = None invalid_block_ids = set[int]() for model_runner_output in outputs: + assert model_runner_output is not None kv_output = model_runner_output.kv_connector_output if not kv_output: continue @@ -204,6 +208,7 @@ class KVOutputAggregator: # select output of the worker specified by output_rank output = outputs[output_rank] + assert output is not None output.kv_connector_output = KVConnectorOutput( finished_sending=finished_sending or None, finished_recving=finished_recving or None, @@ -215,13 +220,16 @@ class KVOutputAggregator: return output def async_aggregate( - self, output_futures: Sequence[Future[ModelRunnerOutput]], output_rank: int = 0 - ) -> Future[ModelRunnerOutput]: + self, + output_futures: Sequence[Future[ModelRunnerOutput | None]], + output_rank: int = 0, + ) -> Future[ModelRunnerOutput | None]: """Takes a list of futures and returns a single future which resolves to the respective list of outputs.""" - result_future: Future[ModelRunnerOutput] = Future() + result_future: Future[ModelRunnerOutput | None] = Future() outputs: list[ModelRunnerOutput | None] = [None] * len(output_futures) + remaining = len(output_futures) def make_callback(idx): def callback(fut): @@ -236,12 +244,10 @@ class KVOutputAggregator: result_future.set_exception(e) # this check assumes io_thread_pool uses a single thread - if all(outputs): - result_future.set_result( - self.aggregate( - cast(list[ModelRunnerOutput], outputs), output_rank - ) - ) + nonlocal remaining + remaining -= 1 + if not remaining: + result_future.set_result(self.aggregate(outputs, output_rank)) return callback diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py index da6e4aa2996bb..0ad994c360b01 100644 --- a/vllm/v1/core/sched/async_scheduler.py +++ b/vllm/v1/core/sched/async_scheduler.py @@ -15,8 +15,12 @@ class AsyncScheduler(Scheduler): scheduler_output: SchedulerOutput, ) -> None: super()._update_after_schedule(scheduler_output) + pending_structured_output_tokens = False for req_id in scheduler_output.num_scheduled_tokens: request = self.requests[req_id] + pending_structured_output_tokens |= ( + request.use_structured_output and request.num_output_placeholders > 0 + ) if ( request.num_computed_tokens == request.num_tokens + request.num_output_placeholders @@ -25,6 +29,10 @@ class AsyncScheduler(Scheduler): # TODO(woosuk): Support speculative decoding. 
request.num_output_placeholders += 1 + scheduler_output.pending_structured_output_tokens = ( + pending_structured_output_tokens + ) + def _update_request_with_output( self, request: Request, diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index c36483203343d..291d33c9bf989 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 - from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import EngineCoreOutputs from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput @@ -40,6 +40,12 @@ class SchedulerInterface(ABC): """ raise NotImplementedError + @abstractmethod + def get_grammar_bitmask( + self, scheduler_output: "SchedulerOutput" + ) -> "GrammarOutput | None": + raise NotImplementedError + @abstractmethod def update_from_output( self, diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index cc6b89e2bf3f1..866136648bcba 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -181,12 +181,17 @@ class SchedulerOutput: # freed from the encoder cache. free_encoder_mm_hashes: list[str] - # ids of structured outputs requests included in the bitmask, in the - # same order as the corresponding stacked rows of the bitmask. - # There may be more than one row per request in the case of speculative decoding. - structured_output_request_ids: list[str] - # the bitmask for the whole batch - grammar_bitmask: "npt.NDArray[np.int32] | None" + # Whether the scheduled requests have all the output tokens they + # need to perform grammar bitmask computation. + pending_structured_output_tokens: bool = False # KV Cache Connector metadata. kv_connector_metadata: KVConnectorMetadata | None = None + + +@dataclass +class GrammarOutput: + # ids of structured output requests. + structured_output_request_ids: list[str] + # Bitmask ordered as structured_output_request_ids. 
+ grammar_bitmask: "npt.NDArray[np.int32]" diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 98c8f08b0aae8..f51744eb2640b 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -5,7 +5,7 @@ import itertools import time from collections import defaultdict from collections.abc import Iterable -from typing import TYPE_CHECKING, Any +from typing import Any from vllm.config import VllmConfig from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch @@ -24,7 +24,12 @@ from vllm.v1.core.encoder_cache_manager import ( ) from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager from vllm.v1.core.sched.interface import SchedulerInterface -from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput +from vllm.v1.core.sched.output import ( + CachedRequestData, + GrammarOutput, + NewRequestData, + SchedulerOutput, +) from vllm.v1.core.sched.request_queue import SchedulingPolicy, create_request_queue from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs @@ -35,10 +40,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats from vllm.v1.structured_output import StructuredOutputManager -if TYPE_CHECKING: - import numpy as np - import numpy.typing as npt - logger = init_logger(__name__) @@ -619,9 +620,6 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens, req_to_new_blocks, ) - structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask( - num_scheduled_tokens.keys(), scheduled_spec_decode_tokens - ) # Record the request ids that were scheduled in this step. self.prev_step_scheduled_req_ids.clear() @@ -641,8 +639,6 @@ class Scheduler(SchedulerInterface): # the previous and the current steps. finished_req_ids=self.finished_req_ids, free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(), - structured_output_request_ids=structured_output_request_ids, - grammar_bitmask=grammar_bitmask, ) # NOTE(Kuntai): this function is designed for multiple purposes: @@ -872,9 +868,8 @@ class Scheduler(SchedulerInterface): def get_grammar_bitmask( self, - scheduled_request_ids: Iterable[str], - scheduled_spec_decode_tokens: dict[str, list[int]], - ) -> tuple[list[str], "npt.NDArray[np.int32] | None"]: + scheduler_output: SchedulerOutput, + ) -> GrammarOutput | None: # Collect list of scheduled request ids that use structured output. # The corresponding rows of the bitmask will be in this order. # PERF: in case of chunked prefill, @@ -883,18 +878,18 @@ class Scheduler(SchedulerInterface): # cycle to fill in the bitmask, which could be a big no-op. 
structured_output_request_ids = [ req_id - for req_id in scheduled_request_ids + for req_id in scheduler_output.num_scheduled_tokens if (req := self.requests.get(req_id)) and req.use_structured_output ] if not structured_output_request_ids: - return structured_output_request_ids, None + return None bitmask = self.structured_output_manager.grammar_bitmask( self.requests, structured_output_request_ids, - scheduled_spec_decode_tokens, + scheduler_output.scheduled_spec_decode_tokens, ) - return structured_output_request_ids, bitmask + return GrammarOutput(structured_output_request_ids, bitmask) def update_from_output( self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index bfe87b718282c..78af197821e2e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -12,7 +12,7 @@ from concurrent.futures import Future from contextlib import ExitStack, contextmanager from inspect import isclass, signature from logging import DEBUG -from typing import Any, TypeVar +from typing import Any, TypeVar, cast import msgspec import zmq @@ -334,9 +334,12 @@ class EngineCore: if not self.scheduler.has_requests(): return {}, False scheduler_output = self.scheduler.schedule() - + future = self.model_executor.execute_model(scheduler_output, non_block=True) + grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output) with self.log_error_detail(scheduler_output): - model_output = self.model_executor.execute_model(scheduler_output) + model_output = future.result() + if model_output is None: + model_output = self.model_executor.sample_tokens(grammar_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output @@ -376,20 +379,47 @@ class EngineCore: assert len(batch_queue) < self.batch_queue_size model_executed = False + deferred_scheduler_output = None if self.scheduler.has_requests(): scheduler_output = self.scheduler.schedule() - future = self.model_executor.execute_model(scheduler_output, non_block=True) - batch_queue.appendleft((future, scheduler_output)) - + exec_future = self.model_executor.execute_model( + scheduler_output, non_block=True + ) model_executed = scheduler_output.total_num_scheduled_tokens > 0 - if ( - model_executed - and len(batch_queue) < self.batch_queue_size - and not batch_queue[-1][0].done() - ): - # Don't block on next worker response unless the queue is full - # or there are no more requests to schedule. - return None, True + + if scheduler_output.pending_structured_output_tokens: + # We need to defer sampling until we have processed the model output + # from the prior step. + deferred_scheduler_output = scheduler_output + # Block-wait for execute to return (continues running async on the GPU). + with self.log_error_detail(scheduler_output): + exec_result = exec_future.result() + assert exec_result is None + else: + # We aren't waiting for any tokens, get any grammar output immediately. + grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output) + # Block-wait for execute to return (continues running async on the GPU). + with self.log_error_detail(scheduler_output): + exec_result = exec_future.result() + + if exec_result is None: + # Call sample tokens. + future = self.model_executor.sample_tokens( + grammar_output, non_block=True + ) + else: + # No sampling required (e.g. all requests finished). + future = cast(Future[ModelRunnerOutput], exec_future) + # Add this step's future to the queue. 
+ batch_queue.appendleft((future, scheduler_output)) + if ( + model_executed + and len(batch_queue) < self.batch_queue_size + and not batch_queue[-1][0].done() + ): + # Don't block on next worker response unless the queue is full + # or there are no more requests to schedule. + return None, True elif not batch_queue: # Queue is empty. We should not reach here since this method should @@ -405,6 +435,19 @@ class EngineCore: engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output ) + + # NOTE(nick): We can either handle the deferred tasks here or save + # in a field and do it immediately once step_with_batch_queue is + # re-called. The latter slightly favors TTFT over TPOT/throughput. + if deferred_scheduler_output: + # We now have the tokens needed to compute the bitmask for the + # deferred request. Get the bitmask and call sample tokens. + grammar_output = self.scheduler.get_grammar_bitmask( + deferred_scheduler_output + ) + future = self.model_executor.sample_tokens(grammar_output, non_block=True) + batch_queue.appendleft((future, deferred_scheduler_output)) + return engine_core_outputs, model_executed def shutdown(self): diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index ef7840e1796f7..d76c6107ad2ba 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -16,7 +16,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask from vllm.utils.import_utils import resolve_obj_by_qualname -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput @@ -187,28 +187,44 @@ class Executor(ABC): @overload def execute_model( - self, - scheduler_output: SchedulerOutput, - non_block: Literal[False] = False, - ) -> ModelRunnerOutput: + self, scheduler_output: SchedulerOutput, non_block: Literal[False] = False + ) -> ModelRunnerOutput | None: pass @overload def execute_model( - self, - scheduler_output: SchedulerOutput, - non_block: Literal[True] = True, - ) -> Future[ModelRunnerOutput]: + self, scheduler_output: SchedulerOutput, non_block: Literal[True] = True + ) -> Future[ModelRunnerOutput | None]: pass def execute_model( self, scheduler_output: SchedulerOutput, non_block: bool = False - ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: + ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: output = self.collective_rpc( # type: ignore[call-overload] "execute_model", args=(scheduler_output,), non_block=non_block ) return output[0] + @overload + def sample_tokens( + self, grammar_output: GrammarOutput | None, non_block: Literal[False] = False + ) -> ModelRunnerOutput: + pass + + @overload + def sample_tokens( + self, grammar_output: GrammarOutput | None, non_block: Literal[True] = True + ) -> Future[ModelRunnerOutput]: + pass + + def sample_tokens( + self, grammar_output: GrammarOutput | None, non_block: bool = False + ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: + output = self.collective_rpc( # type: ignore[call-overload] + "sample_tokens", args=(grammar_output,), non_block=non_block + ) + return output[0] + def execute_dummy_batch(self) -> None: self.collective_rpc("execute_dummy_batch") diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 
4c58d5771c39b..999a3ba870ead 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -46,7 +46,7 @@ from vllm.utils.system_utils import ( get_mp_context, set_process_title, ) -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput from vllm.v1.worker.worker_base import WorkerWrapperBase @@ -132,15 +132,12 @@ class MultiprocExecutor(Executor): uw.death_writer.close() self._ensure_worker_termination([uw.proc for uw in unready_workers]) - # For pipeline parallel, we use a thread pool for asynchronous - # execute_model. - if self.max_concurrent_batches > 1: - # Note: must use only 1 IO thread to keep dequeue sequence - # from the response queue - # _async_aggregate_workers_output also assumes a single IO thread - self.io_thread_pool = ThreadPoolExecutor( - max_workers=1, thread_name_prefix="mp_exec_io" - ) + # Note: must use only 1 IO thread to keep dequeue sequence + # from the response queue. + # _async_aggregate_workers_output also assumes a single IO thread. + self.io_thread_pool = ThreadPoolExecutor( + max_workers=1, thread_name_prefix="mp_exec_io" + ) self.output_rank = self._get_output_rank() self.has_connector = self.vllm_config.kv_transfer_config is not None @@ -180,15 +177,27 @@ class MultiprocExecutor(Executor): self.failure_callback = callback def execute_model( # type: ignore[override] - self, - scheduler_output: SchedulerOutput, - non_block: bool = False, + self, scheduler_output: SchedulerOutput, non_block: bool = False + ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: + return self._execute_with_aggregation( + "execute_model", scheduler_output, non_block=non_block + ) + + def sample_tokens( # type: ignore[override] + self, grammar_output: GrammarOutput | None, non_block: bool = False ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: + return self._execute_with_aggregation( # type: ignore[return-value] + "sample_tokens", grammar_output, non_block=non_block + ) + + def _execute_with_aggregation( + self, method: str, *args, non_block: bool = False + ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: if not self.has_connector: # get output only from a single worker (output_rank) (output,) = self.collective_rpc( - "execute_model", - args=(scheduler_output,), + method, + args=args, unique_reply_rank=self.output_rank, non_block=non_block, timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS, @@ -197,8 +206,8 @@ class MultiprocExecutor(Executor): # get output from all workers outputs = self.collective_rpc( - "execute_model", - args=(scheduler_output,), + method, + args=args, non_block=non_block, timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS, ) diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index a4823acc87642..4a69cca723ac9 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -19,7 +19,7 @@ from vllm.utils.network_utils import ( get_ip, get_open_port, ) -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.executor.ray_utils import ( @@ -41,6 +41,9 @@ if TYPE_CHECKING: logger = init_logger(__name__) +COMPLETED_NONE_FUTURE: 
Future[ModelRunnerOutput | None] = Future() +COMPLETED_NONE_FUTURE.set_result(None) + @dataclass class RayWorkerMetaData: @@ -96,6 +99,8 @@ class RayDistributedExecutor(Executor): # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None + self.scheduler_output: SchedulerOutput | None = None + @property def max_concurrent_batches(self) -> int: """Ray distributed executor supports pipeline parallelism, @@ -381,22 +386,46 @@ class RayDistributedExecutor(Executor): self.shutdown() def execute_model( # type: ignore[override] - self, scheduler_output: SchedulerOutput, non_block: bool = False + self, + scheduler_output: SchedulerOutput, + non_block: bool = False, + ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: + if self.scheduler_output is not None: + raise RuntimeError( + "State error: sample_tokens() must be called " + "after execute_model() returns None." + ) + self.scheduler_output = scheduler_output + return COMPLETED_NONE_FUTURE if non_block else None + + def sample_tokens( # type: ignore[override] + self, + grammar_output: "GrammarOutput | None", + non_block: bool = False, ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: """Execute the model on the Ray workers. + The scheduler output to use should have been provided in + a prior call to execute_model(). + Args: - scheduler_output: The scheduler output to execute. + grammar_output: The structured outputs grammar bitmask, if applicable. non_block: If True, the method will return a Future. Returns: The model runner output. """ + scheduler_output = self.scheduler_output + if scheduler_output is None: + return None # noqa + + self.scheduler_output = None + # Build the compiled DAG for the first time. if self.forward_dag is None: # type: ignore self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) - refs = self.forward_dag.execute(scheduler_output) # type: ignore + refs = self.forward_dag.execute((scheduler_output, grammar_output)) # type: ignore if not self.has_connector: # Get output only from a single worker (output_rank) diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index 9385e55b066f8..a282cdc9909db 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -19,7 +19,7 @@ from vllm.v1.outputs import AsyncModelRunnerOutput from vllm.v1.worker.worker_base import WorkerWrapperBase if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput logger = init_logger(__name__) @@ -82,36 +82,41 @@ try: def execute_model_ray( self, - scheduler_output: Union[ - "SchedulerOutput", tuple["SchedulerOutput", "IntermediateTensors"] - ], + execute_model_input: tuple["SchedulerOutput", "GrammarOutput"] + | tuple["SchedulerOutput", "GrammarOutput", "IntermediateTensors"], ) -> Union[ - "ModelRunnerOutput", tuple["SchedulerOutput", "IntermediateTensors"] + "ModelRunnerOutput", + tuple["SchedulerOutput", "GrammarOutput", "IntermediateTensors"], ]: # This method is used by Ray Compiled Graph to execute the model, # and it needs a special logic of self.setup_device_if_necessary() self.setup_device_if_necessary() assert self.worker is not None, "Worker is not initialized" - if isinstance(scheduler_output, tuple): - scheduler_output, intermediate_tensors = scheduler_output + if len(execute_model_input) == 3: + scheduler_output, grammar_output, intermediate_tensors = ( + execute_model_input + ) else: - scheduler_output, 
intermediate_tensors = scheduler_output, None + scheduler_output, grammar_output = execute_model_input + intermediate_tensors = None assert self.worker.model_runner is not None output = self.worker.model_runner.execute_model( scheduler_output, intermediate_tensors ) if isinstance(output, IntermediateTensors): - output = scheduler_output, output + output = scheduler_output, grammar_output, output elif not get_pp_group().is_last_rank: # Case where there are no scheduled requests # but may still be finished requests. assert not output or not output.req_ids - output = scheduler_output, None - # Ensure outputs crossing Ray compiled DAG are serializable. - # AsyncModelRunnerOutput holds CUDA events and cannot be - # pickled. - if isinstance(output, AsyncModelRunnerOutput): - output = output.get_output() + output = scheduler_output, grammar_output, None + elif output is None: + output = self.worker.model_runner.sample_tokens(grammar_output) + # Ensure outputs crossing Ray compiled DAG are serializable. + # AsyncModelRunnerOutput holds CUDA events and cannot be + # pickled. + if isinstance(output, AsyncModelRunnerOutput): + output = output.get_output() return output def override_env_vars(self, vars: dict[str, str]): diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index ef9bae2367bed..d2d14fcfc4362 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -16,6 +16,7 @@ from diskcache import Cache import vllm.envs as envs from vllm.logger import init_logger from vllm.utils.import_utils import LazyLoader +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput if TYPE_CHECKING: import outlines_core as oc @@ -24,7 +25,6 @@ if TYPE_CHECKING: import xgrammar as xgr from vllm.transformers_utils.tokenizer import AnyTokenizer - from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch else: xgr = LazyLoader("xgr", globals(), "xgrammar") @@ -47,6 +47,7 @@ CACHE = None def apply_grammar_bitmask( scheduler_output: SchedulerOutput, + grammar_output: GrammarOutput, input_batch: InputBatch, logits: torch.Tensor, ) -> None: @@ -58,9 +59,9 @@ def apply_grammar_bitmask( input_batch (InputBatch): The input of model runner. logits (torch.Tensor): The output logits of model forward. """ - grammar_bitmask = scheduler_output.grammar_bitmask - if grammar_bitmask is None: - return + # Serialization of np.ndarray is much more efficient than a tensor, + # so we receive it in that format. + grammar_bitmask = grammar_output.grammar_bitmask # We receive the structured output bitmask from the scheduler, # compacted to contain bitmasks only for structured output requests. 
@@ -79,7 +80,7 @@ def apply_grammar_bitmask( cumulative_offset += len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) ) - if req_id in scheduler_output.structured_output_request_ids: + if req_id in grammar_output.structured_output_request_ids: struct_out_req_batch_indices[req_id] = logit_index out_indices = [] @@ -91,7 +92,7 @@ def apply_grammar_bitmask( dtype=grammar_bitmask.dtype, ) cumulative_index = 0 - for req_id in scheduler_output.structured_output_request_ids: + for req_id in grammar_output.structured_output_request_ids: num_spec_tokens = len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) ) @@ -101,22 +102,28 @@ def apply_grammar_bitmask( sorted_bitmask[logit_index + i] = grammar_bitmask[cumulative_index + i] out_indices.append(logit_index + i) cumulative_index += 1 + num_spec_tokens - grammar_bitmask = sorted_bitmask + + # Copy async to device as tensor. + grammar_bitmask = torch.from_numpy(sorted_bitmask).to( + logits.device, non_blocking=True + ) # If the length of out indices and the logits have the same shape # we don't need to pass indices to the kernel, # since the bitmask is already aligned with the logits. skip_out_indices = len(out_indices) == logits.shape[0] - # Serialization of np.ndarray is much more efficient than a tensor, - # so we receive it in that format. - grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() + index_tensor = None + if not skip_out_indices: + # xgrammar expects a python list of indices but it will actually work with + # a tensor. If we copy the tensor ourselves here we can do it in a non_blocking + # manner and there should be no cpu sync within xgrammar. + index_tensor = torch.tensor( + out_indices, dtype=torch.int32, device="cpu", pin_memory=True + ) + index_tensor = index_tensor.to(logits.device, non_blocking=True) - xgr.apply_token_bitmask_inplace( - logits, - grammar_bitmask.to(logits.device, non_blocking=True), - indices=out_indices if not skip_out_indices else None, - ) + xgr.apply_token_bitmask_inplace(logits, grammar_bitmask, indices=index_tensor) class OutlinesVocabulary: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ba852bb89f33d..66a9d72912618 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -109,6 +109,7 @@ from vllm.v1.outputs import ( EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, + KVConnectorOutput, LogprobsLists, LogprobsTensors, ModelRunnerOutput, @@ -150,7 +151,7 @@ from .utils import ( if TYPE_CHECKING: from vllm.model_executor.model_loader.tensorizer import TensorizerConfig - from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput logger = init_logger(__name__) @@ -218,6 +219,20 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput): return output +class ExecuteModelState(NamedTuple): + """Ephemeral cached state transferred between execute_model() and + sample_tokens(), after execute_model() returns None.""" + + scheduler_output: "SchedulerOutput" + logits: torch.Tensor + spec_decode_metadata: SpecDecodeMetadata | None + spec_decode_common_attn_metadata: CommonAttentionMetadata | None + hidden_states: torch.Tensor + sample_hidden_states: torch.Tensor + aux_hidden_states: list[torch.Tensor] | None + kv_connector_output: KVConnectorOutput | None + + class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def __init__( self, @@ -509,6 +524,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, 
KVConnectorModelRunnerMixin): pin_memory=self.pin_memory, ) + # Ephemeral state transferred between execute_model() and sample_tokens(). + self.execute_model_state: ExecuteModelState | None = None + def reset_mm_cache(self) -> None: if self.mm_budget: self.mm_budget.reset_cache() @@ -2113,7 +2131,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): num_input_tokens: int, # Padded intermediate_tensors: IntermediateTensors | None = None, ) -> tuple[ - int, torch.Tensor | None, torch.Tensor | None, torch.Tensor, @@ -2207,7 +2224,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_kwargs.update(encoder_inputs) return ( - num_scheduled_tokens, input_ids, inputs_embeds, positions, @@ -2425,13 +2441,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self, scheduler_output: "SchedulerOutput", intermediate_tensors: IntermediateTensors | None = None, - ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: + ) -> ModelRunnerOutput | IntermediateTensors | None: + if self.execute_model_state is not None: + raise RuntimeError( + "State error: sample_tokens() must be called " + "after execute_model() returns None." + ) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with record_function_or_nullcontext("Preprocess"): with self.synchronize_input_prep(): # Update persistent batch states. self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: + if not num_scheduled_tokens: if not has_kv_transfer_group(): # Return empty ModelRunnerOutput if no work to do. return EMPTY_MODEL_RUNNER_OUTPUT @@ -2471,7 +2493,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) ( - num_scheduled_tokens, input_ids, inputs_embeds, positions, @@ -2559,6 +2580,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Rare case. assert not self.is_pooling_model + sample_hidden_states = hidden_states[logits_indices] if not get_pp_group().is_last_rank: all_gather_tensors = { "residual": not is_residual_scattered_for_sp( @@ -2572,7 +2594,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) logits = None else: - sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states) model_output_broadcast_data = {} @@ -2585,9 +2606,45 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert model_output_broadcast_data is not None logits = model_output_broadcast_data["logits"] - # Apply structured output bitmasks if present - if scheduler_output.structured_output_request_ids: - apply_grammar_bitmask(scheduler_output, self.input_batch, logits) + self.execute_model_state = ExecuteModelState( + scheduler_output, + logits, + spec_decode_metadata, + spec_decode_common_attn_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + kv_connector_output, + ) + return None + + @torch.inference_mode + def sample_tokens( + self, grammar_output: "GrammarOutput | None" + ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: + if self.execute_model_state is None: + # Nothing to do (PP non-final rank case), output isn't used. + return None # noqa + + # Unpack ephemeral state. + ( + scheduler_output, + logits, + spec_decode_metadata, + spec_decode_common_attn_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + kv_connector_output, + ) = self.execute_model_state + # Clear ephemeral state. 
+ self.execute_model_state = None + + # Apply structured output bitmasks if present. + if grammar_output is not None: + apply_grammar_bitmask( + scheduler_output, grammar_output, self.input_batch, logits + ) with record_function_or_nullcontext("Sample"): sampler_output = self._sample(logits, spec_decode_metadata) @@ -2646,7 +2703,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): sampler_output, logits, hidden_states, - num_scheduled_tokens, + scheduler_output.total_num_scheduled_tokens, spec_decode_metadata, ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 5b11bdf5282fa..c2bf1419bebd7 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,6 +6,7 @@ import copy import gc import os from contextlib import AbstractContextManager, nullcontext +from types import NoneType from typing import TYPE_CHECKING, Any import torch @@ -37,6 +38,7 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import MemorySnapshot, memory_profiling +from vllm.v1.core.sched.output import GrammarOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ( @@ -508,11 +510,16 @@ class Worker(WorkerBase): def get_supported_tasks(self) -> tuple[SupportedTask, ...]: return self.model_runner.get_supported_tasks() + @torch.inference_mode() + def sample_tokens( + self, grammar_output: "GrammarOutput" + ) -> ModelRunnerOutput | AsyncModelRunnerOutput: + return self.model_runner.sample_tokens(grammar_output) + @torch.inference_mode() def execute_model( - self, - scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None: + self, scheduler_output: "SchedulerOutput" + ) -> ModelRunnerOutput | None: intermediate_tensors = None forward_pass = scheduler_output.total_num_scheduled_tokens > 0 num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens @@ -531,13 +538,13 @@ class Worker(WorkerBase): ) output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) - if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)): + if isinstance(output, (ModelRunnerOutput, NoneType)): return output assert isinstance(output, IntermediateTensors) parallel_config = self.vllm_config.parallel_config assert ( - parallel_config.distributed_executor_backend != ("external_launcher") + parallel_config.distributed_executor_backend != "external_launcher" and not get_pp_group().is_last_rank ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 0ced138b940d0..0e34504a5e268 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -92,7 +92,7 @@ from .utils import ( ) if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput logger = init_logger(__name__) @@ -372,6 +372,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: self.sample_from_logits_func = self.sample_from_logits + # For passing scheduler_output between successive + # execute_model() and sample_tokens() calls. 
+ self.scheduler_output: SchedulerOutput | None = None + self.mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None + def reset_mm_cache(self) -> None: if self.mm_budget: self.mm_budget.reset_cache() @@ -1078,7 +1083,12 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self, scheduler_output: "SchedulerOutput", intermediate_tensors: IntermediateTensors | None = None, - ) -> ModelRunnerOutput: + ) -> ModelRunnerOutput | None: + if self.scheduler_output is not None: + raise RuntimeError( + "State error: sample_tokens() must be called " + "after execute_model() returns None." + ) # Update cached state self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: @@ -1088,14 +1098,30 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + mm_embed_inputs = None if self.supports_mm_inputs: # Run the multimodal encoder if any. self._execute_mm_encoder(scheduler_output) mm_embed_inputs = self._gather_mm_embeddings(scheduler_output) - else: - mm_embed_inputs = None torch_xla.sync(wait=False) + + self.scheduler_output = scheduler_output + self.mm_embed_inputs = mm_embed_inputs + return None + + @torch.no_grad() + def sample_tokens( + self, grammar_output: "GrammarOutput | None" + ) -> ModelRunnerOutput: + if self.scheduler_output is None: + # Nothing to do (PP non-final rank case), output isn't used. + return None # noqa + scheduler_output = self.scheduler_output + mm_embed_inputs = self.mm_embed_inputs + self.scheduler_output = None + self.mm_embed_inputs = None + # Prepare inputs, the requests might be split into multiple # executions, combine the result of each execution. start_index = 0 @@ -1131,9 +1157,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): tpu_sampling_metadata = TPUSupportedSamplingMetadata.from_input_batch( self.input_batch, padded_num_reqs, self.device ) - if scheduler_output.grammar_bitmask is not None: + if grammar_output is not None: require_struct_decoding, grammar_bitmask_padded, arange = ( - self.prepare_structured_decoding_input(logits, scheduler_output) + self.prepare_structured_decoding_input(logits, grammar_output) ) logits = self.structured_decode( require_struct_decoding, grammar_bitmask_padded, logits, arange @@ -1954,10 +1980,9 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): return self.model.get_input_embeddings(*args, **kwargs) def prepare_structured_decoding_input( - self, logits: torch.Tensor, scheduler_output: "SchedulerOutput" + self, logits: torch.Tensor, grammar_output: "GrammarOutput" ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - grammar_bitmask = scheduler_output.grammar_bitmask - assert grammar_bitmask is not None + grammar_bitmask = grammar_output.grammar_bitmask num_reqs, _ = logits.shape # Reset pre-allocated tensors @@ -1965,7 +1990,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.require_structured_out_cpu.zero_() cumulative_mask_idx = 0 - for req_id in scheduler_output.structured_output_request_ids: + for req_id in grammar_output.structured_output_request_ids: if req_id not in self.input_batch.req_id_to_index: continue batch_index = self.input_batch.req_id_to_index[req_id] diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index e867e3c07caa5..a716a9c3aa822 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -17,7 +17,6 @@ from 
vllm.distributed import ( ) from vllm.distributed.kv_transfer import ( ensure_kv_transfer_initialized, - has_kv_transfer_group, ) from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -27,7 +26,7 @@ from vllm.platforms.tpu import USE_TPU_INFERENCE from vllm.tasks import SupportedTask from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.utils import report_usage_stats @@ -255,13 +254,13 @@ class TPUWorker: tpu_kv_cache_bytes = tpu_kv_cache_bytes * head_size // padded_head_size return int(tpu_kv_cache_bytes) + def sample_tokens(self, grammar_output: "GrammarOutput") -> ModelRunnerOutput: + return self.model_runner.sample_tokens(grammar_output) + def execute_model( - self, - scheduler_output: "SchedulerOutput", + self, scheduler_output: "SchedulerOutput" ) -> ModelRunnerOutput | None: - output = self.model_runner.execute_model(scheduler_output) - # every worker's output is needed when kv_transfer_group is set up - return output if self.is_driver_worker or has_kv_transfer_group() else None + return self.model_runner.execute_model(scheduler_output) def profile(self, is_start: bool = True): if self.rank < 1: diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 9162e2e85a517..30ea0ab77bd9e 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -20,10 +20,12 @@ from vllm.v1.kv_cache_interface import KVCacheSpec from vllm.v1.serial_utils import run_method if TYPE_CHECKING: - from vllm.v1.core.sched.output import SchedulerOutput - from vllm.v1.outputs import ModelRunnerOutput + from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput + from vllm.v1.outputs import AsyncModelRunnerOutput, ModelRunnerOutput else: SchedulerOutput = object + GrammarOutput = object + AsyncModelRunnerOutput = object ModelRunnerOutput = object logger = init_logger(__name__) @@ -122,7 +124,21 @@ class WorkerBase: """Load model onto target device.""" raise NotImplementedError - def execute_model(self, scheduler_output: SchedulerOutput) -> ModelRunnerOutput: + def execute_model( + self, scheduler_output: SchedulerOutput + ) -> ModelRunnerOutput | None: + """If this method returns None, sample_tokens should be called immediately after + to obtain the ModelRunnerOutput. + + Note that this design may be changed in future if/when structured outputs + parallelism is re-architected. 
+ """ + raise NotImplementedError + + def sample_tokens( + self, grammar_output: GrammarOutput + ) -> ModelRunnerOutput | AsyncModelRunnerOutput: + """Should be called immediately after execute_model iff it returned None.""" raise NotImplementedError def get_cache_block_size_bytes(self) -> int: @@ -344,7 +360,7 @@ class WorkerWrapperBase: scheduler_output: SchedulerOutput, *args, **kwargs, - ) -> ModelRunnerOutput: + ) -> ModelRunnerOutput | None: self._apply_mm_cache(scheduler_output) assert self.worker is not None From bc4486d60962a0e251fe8d98fc26cfca206fd54c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 1 Nov 2025 10:05:12 +0800 Subject: [PATCH 062/976] [Kernel] Enable FusedMoEModularKernel support bias (#27754) Signed-off-by: Jee Jee Li --- vllm/lora/layers/fused_moe.py | 43 +++++++------------ vllm/model_executor/layers/fused_moe/layer.py | 2 - 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 5a9fd35c2907a..275a2ed0c6813 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -15,9 +15,7 @@ from vllm.distributed.parallel_state import ( from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import ( - FUSED_MOE_UNQUANTIZED_CONFIG, _get_config_dtype_str, - mxfp4_w4a16_moe_quant_config, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( modular_marlin_fused_moe, @@ -26,13 +24,16 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( modular_triton_fused_moe, try_get_optimal_moe_config, ) -from vllm.model_executor.layers.quantization.mxfp4 import Mxfp4Config class FusedMoEWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: FusedMoE) -> None: super().__init__() self.base_layer = base_layer + + assert not self.base_layer.use_ep, ( + "EP support for Fused MoE LoRA is not implemented yet." 
+ ) self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.device = base_layer.w2_weight.device @@ -42,17 +43,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): moe_state_dict = {} top_k = self.base_layer.top_k - if self.base_layer.quant_config is None: - quant_config = FUSED_MOE_UNQUANTIZED_CONFIG - elif not isinstance(self.base_layer.quant_config, Mxfp4Config): - quant_config = self.base_layer.quant_config - else: - quant_config = mxfp4_w4a16_moe_quant_config( - w1_bias=self.base_layer.w13_bias, - w2_bias=self.base_layer.w2_bias, - w1_scale=self.base_layer.w13_weight_scale, - w2_scale=self.base_layer.w2_weight_scale, - ) + self.base_layer.ensure_moe_quant_config_init() + quant_config = self.base_layer.quant_method.moe_quant_config m_fused_moe_fn = ( modular_triton_fused_moe( @@ -69,7 +61,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): moe_state_dict["hidden_states"] = kwargs["hidden_states"] moe_state_dict["topk_ids"] = kwargs["topk_ids"] moe_state_dict["topk_weights"] = kwargs["topk_weights"] - moe_state_dict["global_num_experts"] = kwargs["global_num_experts"] moe_state_dict["expert_map"] = kwargs["expert_map"] moe_state_dict["apply_router_weight_on_input"] = kwargs[ "apply_router_weight_on_input" @@ -86,7 +77,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): hidden_states = moe_state_dict["hidden_states"] topk_weights = moe_state_dict["topk_weights"] curr_topk_ids = moe_state_dict["topk_ids"] - global_num_experts = moe_state_dict["global_num_experts"] + expert_map = moe_state_dict["expert_map"] config_dtype = _get_config_dtype_str( @@ -118,7 +109,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): curr_topk_ids, num_tokens, config["BLOCK_SIZE_M"], - global_num_experts, + self.base_layer.local_num_experts, max_loras, expert_map, ) @@ -236,14 +227,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ) -> None: """Initializes lora matrices.""" - assert not self.base_layer.use_ep, ( - "EP support for Fused MoE LoRA is not implemented yet." 
- ) - self.w1_lora_a_stacked = torch.zeros( ( max_loras, - self.base_layer.global_num_experts, + self.base_layer.local_num_experts, lora_config.max_lora_rank, self.base_layer.hidden_size, ), @@ -253,7 +240,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w1_lora_b_stacked = torch.zeros( ( max_loras, - self.base_layer.global_num_experts, + self.base_layer.local_num_experts, self.base_layer.intermediate_size_per_partition, lora_config.max_lora_rank, ), @@ -264,7 +251,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w2_lora_a_stacked = torch.zeros( ( max_loras, - self.base_layer.global_num_experts, + self.base_layer.local_num_experts, lora_config.max_lora_rank, self.base_layer.intermediate_size_per_partition, ), @@ -274,7 +261,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w2_lora_b_stacked = torch.zeros( ( max_loras, - self.base_layer.global_num_experts, + self.base_layer.local_num_experts, self.base_layer.hidden_size, lora_config.max_lora_rank, ), @@ -285,7 +272,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w3_lora_a_stacked = torch.zeros( ( max_loras, - self.base_layer.global_num_experts, + self.base_layer.local_num_experts, lora_config.max_lora_rank, self.base_layer.hidden_size, ), @@ -295,7 +282,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w3_lora_b_stacked = torch.zeros( ( max_loras, - self.base_layer.global_num_experts, + self.base_layer.local_num_experts, self.base_layer.intermediate_size_per_partition, lora_config.max_lora_rank, ), @@ -308,7 +295,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.lora_a_stacked = [] self.lora_b_stacked = [] for lora_id in range(max_loras): - for experts_id in range(self.base_layer.global_num_experts): + for experts_id in range(self.base_layer.local_num_experts): # gate_proj,down_proj,up_proj self.lora_a_stacked.append(self.w1_lora_a_stacked[lora_id][experts_id]) self.lora_a_stacked.append(self.w2_lora_a_stacked[lora_id][experts_id]) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7dbe4bc543941..46d351b48c5e8 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -672,8 +672,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input=apply_router_weight_on_input, ) elif self.fused_experts is not None: - if self.moe.has_bias: - raise ValueError("FusedMoEModularKernel does not support bias.") result = self.fused_experts( hidden_states=x, w1=layer.w13_weight, From 3a5de7d2d6e65b6580c3ceb905334843a7b6dd6f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 1 Nov 2025 11:54:36 +0800 Subject: [PATCH 063/976] [Bugfix] Fix KDA output (#27905) Signed-off-by: Jee Jee Li --- vllm/model_executor/layers/kda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index 308bc8be1dece..26458f2e3c4da 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -259,7 +259,7 @@ class KimiDeltaAttention(nn.Module, MambaBase): hidden_states: torch.Tensor, positions: torch.Tensor, output: torch.Tensor, - ) -> torch.Tensor: + ) -> None: num_tokens = hidden_states.size(0) q = self.q_proj(hidden_states)[0] k = self.k_proj(hidden_states)[0] @@ -291,8 +291,7 @@ class KimiDeltaAttention(nn.Module, MambaBase): ) core_attn_out = self.o_norm(core_attn_out, g2) core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)") - - return self.o_proj(core_attn_out)[0] + output[:] = 
self.o_proj(core_attn_out)[0] def _forward( self, From 7e2729b57e5c7420e945b6cf21850374195984c7 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Sat, 1 Nov 2025 12:45:02 +0800 Subject: [PATCH 064/976] [Multimodal][XPU]Enable vision attn backend for xpu platform (#27525) Signed-off-by: Yan Ma Signed-off-by: Kunshang Ji Co-authored-by: Yejing Lai Co-authored-by: Guancheng Fu <110874468+gc-fu@users.noreply.github.com> Co-authored-by: Kunshang Ji --- vllm/_ipex_ops.py | 84 +++++++++++++++++------- vllm/attention/layer.py | 35 +++++----- vllm/attention/ops/vit_attn_wrappers.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 5 +- vllm/platforms/xpu.py | 6 ++ 6 files changed, 88 insertions(+), 51 deletions(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 60ee0124c3d9c..95c17cb331f67 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -270,21 +270,23 @@ class ipex_ops: @staticmethod def flash_attn_varlen_func( - out: torch.Tensor, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cu_seqlens_q: torch.Tensor, - seqused_k: torch.Tensor, # we don't support this in ipex kernel max_seqlen_q: int, max_seqlen_k: int, - softmax_scale: float, - causal: bool, - block_table: torch.Tensor, - alibi_slopes: torch.Tensor | None, + softmax_scale: float | None = None, + causal: bool = False, + out: torch.Tensor | None = None, + block_table: torch.Tensor | None = None, + alibi_slopes: torch.Tensor | None = None, window_size: list[int] | None = None, softcap: float | None = 0.0, + seqused_k: torch.Tensor | None = None, cu_seqlens_k: torch.Tensor | None = None, + # passed in qwen vl + dropout_p: float = 0.0, # The following parameters are not used in ipex kernel currently, # we keep API compatible to CUDA's. scheduler_metadata=None, @@ -295,31 +297,63 @@ class ipex_ops: num_splits=0, s_aux: torch.Tensor | None = None, ): + if out is None: + out = torch.empty(q.shape, dtype=q.dtype, device=q.device) real_window_size: tuple[int, int] if window_size is None: real_window_size = (-1, -1) else: assert len(window_size) == 2 real_window_size = (window_size[0], window_size[1]) - return ipex.llm.modules.PagedAttention.flash_attn_varlen_func( - out, - q.contiguous(), - k, - v, - cu_seqlens_q, - seqused_k, - max_seqlen_q, - max_seqlen_k, - softmax_scale, - causal, - block_table, - alibi_slopes, - softcap=softcap, - window_size_left=real_window_size[0], - window_size_right=real_window_size[1], - k_scale=1.0, - v_scale=1.0, - ) + + if block_table is None: + assert cu_seqlens_k is not None, ( + "cu_seqlens_k can't be None when calling varlen_attention." 
+ ) + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + ipex_ops.varlen_attention( + q.contiguous(), + k.contiguous(), + v.contiguous(), + out, + cu_seqlens_q, + cu_seqlens_k, + None, + max_seqlen_q, + max_seqlen_k, + 0.0, + softmax_scale, + False, + causal, + False, + None, + real_window_size[0], + real_window_size[1], + -1, + ) + return out + else: + return ipex.llm.modules.PagedAttention.flash_attn_varlen_func( + out, + q.contiguous(), + k, + v, + cu_seqlens_q, + seqused_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + block_table, + alibi_slopes, + sink=s_aux, + softcap=softcap, + window_size_left=real_window_size[0], + window_size_right=real_window_size[1], + k_scale=1.0, + v_scale=1.0, + ) @staticmethod def get_scheduler_metadata( diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 22eaa22b8b385..17e025155a431 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -123,6 +123,11 @@ def maybe_get_vit_flash_attn_backend( ): attn_backend = _Backend.FLASH_ATTN use_upstream_fa = True + elif current_platform.is_xpu(): + assert attn_backend == _Backend.FLASH_ATTN, ( + "XPU platform only supports FLASH_ATTN as vision attention backend." + ) + use_upstream_fa = False else: return _Backend.TORCH_SDPA, None @@ -133,7 +138,7 @@ def maybe_get_vit_flash_attn_backend( if use_upstream_fa: from flash_attn import flash_attn_varlen_func else: - from vllm.vllm_flash_attn import flash_attn_varlen_func + from vllm.attention.utils.fa_utils import flash_attn_varlen_func else: flash_attn_varlen_func = None @@ -521,22 +526,18 @@ class MultiHeadAttention(nn.Module): # If vllm native fa is selected, we use it directly. use_upstream_fa = False - if current_platform.is_xpu(): - # currently, only torch_sdpa is supported on xpu - self.attn_backend = _Backend.TORCH_SDPA - else: - self.attn_backend = ( - backend - if backend - in { - _Backend.TORCH_SDPA, - _Backend.XFORMERS, - _Backend.PALLAS, - _Backend.ROCM_AITER_FA, - _Backend.FLASH_ATTN, - } - else _Backend.TORCH_SDPA - ) + self.attn_backend = ( + backend + if backend + in { + _Backend.TORCH_SDPA, + _Backend.XFORMERS, + _Backend.PALLAS, + _Backend.ROCM_AITER_FA, + _Backend.FLASH_ATTN, + } + else _Backend.TORCH_SDPA + ) self.attn_backend, self._flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index f71f49a1a31b0..6cefe74416685 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -70,7 +70,7 @@ def flash_attn_maxseqlen_wrapper( if use_upstream_fa: from flash_attn import flash_attn_varlen_func else: - from vllm.vllm_flash_attn import flash_attn_varlen_func + from vllm.attention.utils.fa_utils import flash_attn_varlen_func q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( q, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 3d67653726bd8..3585783e4ccc3 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -364,6 +364,8 @@ class Qwen2_5_VisionAttention(nn.Module): if current_platform.is_rocm() and self.attn_backend == _Backend.FLASH_ATTN: self.use_upstream_fa = True + if current_platform.is_xpu(): + self.use_upstream_fa = False self.is_flash_attn_backend = self.attn_backend in { _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA, @@ -856,10 +858,7 @@ class Qwen2_5_VisionTransformer(nn.Module): ) -> tuple[torch.Tensor, torch.Tensor]: max_seqlen = torch.zeros([], device=cu_seqlens.device) seqlens = torch.zeros(1, device=cu_seqlens.device) - if ( - self.attn_backend == _Backend.FLASH_ATTN - or self.attn_backend == _Backend.ROCM_AITER_FA - ): + if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}: max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() elif self.attn_backend == _Backend.XFORMERS: seqlens = cu_seqlens[1:] - cu_seqlens[:-1] diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f0d7e2e7d7eca..a81acf9f9a36d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -789,10 +789,7 @@ class Qwen2VisionTransformer(nn.Module): self, cu_seqlens: torch.Tensor ) -> tuple[int | None, list[int] | None]: max_seqlen, seqlens = None, None - if ( - self.attn_backend == _Backend.FLASH_ATTN - or self.attn_backend == _Backend.ROCM_AITER_FA - ): + if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}: max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() elif self.attn_backend == _Backend.XFORMERS: seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index cd65cba6b492c..07ab759e4baa6 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -115,6 +115,12 @@ class XPUPlatform(Platform): device_props = torch.xpu.get_device_properties(device_id) return device_props.total_memory + @classmethod + def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> _Backend: + from vllm.attention.backends.registry import _Backend + + return _Backend.FLASH_ATTN + @classmethod def inference_mode(cls): return torch.no_grad() From 29de3cdee4dd7f805931b459398b15c3b5f7057c Mon Sep 17 00:00:00 2001 From: yugong333 Date: Fri, 31 Oct 2025 21:55:46 -0700 Subject: [PATCH 065/976] Adding SplitK in fused_moe_lora kernel (#27818) Signed-off-by: Yu Gong Co-authored-by: Jee Jee Li --- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index e681f3882908e..15031f5e2f9e8 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -88,14 +88,17 @@ def _fused_moe_lora_kernel( grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) # calculate pid_m,pid_n + pid_sk = pid % SPLIT_K + pid_m_n = pid // SPLIT_K num_pid_m = tl.cdiv(EM, BLOCK_SIZE_M) num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group + group_id = pid_m_n // num_pid_in_group first_pid_m = group_id * GROUP_SIZE_M group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + ((pid % 
num_pid_in_group) % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m + pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m) + pid_n = (pid_m_n % num_pid_in_group) // group_size_m num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr + lora_idx) if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: @@ -113,7 +116,7 @@ def _fused_moe_lora_kernel( cur_c_ptr = c_ptr + (slice_id % num_slice_c) * slice_c_size offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N - offs_k = tl.arange(0, BLOCK_SIZE_K) + offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) token_ind = stride_tl * lora_idx + offs_token_id @@ -131,7 +134,8 @@ def _fused_moe_lora_kernel( cur_b_ptr + lora_idx * stride_bl + expert_id * stride_be - + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + offs_k[:, None] * stride_bk + + offs_bn[None, :] * stride_bn ) # accumulator From 879a06579ea8a057c250e9b6cc4e632dabd87d2e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 1 Nov 2025 13:11:07 +0800 Subject: [PATCH 066/976] [CI/Build] Bump transformers version (#27528) Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Isotr0py Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- requirements/common.txt | 2 +- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 2 +- tests/models/multimodal/generation/test_maverick.py | 2 ++ tests/models/registry.py | 12 ++++++------ tests/models/test_transformers.py | 2 +- vllm/model_executor/models/moonvit.py | 4 ++-- vllm/model_executor/models/qwen2_vl.py | 6 ++---- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 81c4d6675006d..724360f8bc9e4 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -7,7 +7,7 @@ requests >= 2.26.0 tqdm blake3 py-cpuinfo -transformers >= 4.56.0 +transformers >= 4.56.0, < 5 tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 63c1908f024b3..d9c5d89c1d52f 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -29,7 +29,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test -transformers==4.56.2 +transformers==4.57.1 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. 
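An aside on the SplitK change to _fused_moe_lora_kernel in the previous patch: the flat Triton program id is now decomposed into a K-slice index (pid_sk) and a grouped (pid_m, pid_n) tile index, so the SPLIT_K programs that share an output tile accumulate disjoint slices of the K dimension. A small pure-Python sketch of that index math, illustrative only and not part of any patch:

def split_pid(pid, split_k, num_pid_m, num_pid_n, group_size_m_max):
    # Mirrors the decomposition in the kernel diff above.
    pid_sk = pid % split_k
    pid_m_n = pid // split_k
    num_pid_in_group = group_size_m_max * num_pid_n
    group_id = pid_m_n // num_pid_in_group
    first_pid_m = group_id * group_size_m_max
    group_size_m = min(num_pid_m - first_pid_m, group_size_m_max)
    pid_m = first_pid_m + (pid_m_n % num_pid_in_group) % group_size_m
    pid_n = (pid_m_n % num_pid_in_group) // group_size_m
    return pid_sk, pid_m, pid_n

# With SPLIT_K=2 and a 4x4 tile grid, every (pid_m, pid_n) pair is produced
# exactly twice, once per K slice.
tiles = [split_pid(p, 2, 4, 4, 2)[1:] for p in range(32)]
assert sorted(tiles) == sorted([(m, n) for m in range(4) for n in range(4)] * 2)

Each K slice then starts its offs_k at pid_sk * BLOCK_SIZE_K, matching the offset change in the kernel above.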
# quantization diff --git a/requirements/test.in b/requirements/test.in index b1ab599ff16e5..f57ec31277ce9 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -37,7 +37,7 @@ datamodel_code_generator # required for minicpm3 test # TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test mteb[bm25s]>=1.38.11, <2 # required for mteb test -transformers==4.56.2 +transformers==4.57.1 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. # quantization diff --git a/requirements/test.txt b/requirements/test.txt index e54bb49fde684..a975f247065da 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1196,7 +1196,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.56.2 +transformers==4.57.1 # via # -r requirements/test.in # genai-perf diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py index fd3386ff67df2..6fc2efa418ddf 100644 --- a/tests/models/multimodal/generation/test_maverick.py +++ b/tests/models/multimodal/generation/test_maverick.py @@ -186,6 +186,8 @@ def create_reduced_config( if "text_config" in config_dict: original_text_layers = config_dict["text_config"]["num_hidden_layers"] config_dict["text_config"]["num_hidden_layers"] = text_layers + original_layer_types = config_dict["text_config"]["layer_types"] + config_dict["text_config"]["layer_types"] = original_layer_types[:text_layers] print(f"Reduced text layers from {original_text_layers} to {text_layers}") original_num_experts = config_dict["text_config"]["num_local_experts"] diff --git a/tests/models/registry.py b/tests/models/registry.py index 7b5977ec58e53..8e1dd4ba91f1d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -882,27 +882,27 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { _TRANSFORMERS_BACKEND_MODELS = { "TransformersEmbeddingModel": _HfExamplesInfo( - "BAAI/bge-base-en-v1.5", min_transformers_version="4.57.0.dev0" + "BAAI/bge-base-en-v1.5", min_transformers_version="5.0.0" ), "TransformersForSequenceClassification": _HfExamplesInfo( "papluca/xlm-roberta-base-language-detection", - min_transformers_version="4.57.0.dev0", + min_transformers_version="5.0.0", ), "TransformersForCausalLM": _HfExamplesInfo( "hmellor/Ilama-3.2-1B", trust_remote_code=True ), "TransformersMultiModalForCausalLM": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), "TransformersMoEForCausalLM": _HfExamplesInfo( - "allenai/OLMoE-1B-7B-0924", min_transformers_version="4.57.0.dev0" + "allenai/OLMoE-1B-7B-0924", min_transformers_version="5.0.0" ), "TransformersMultiModalMoEForCausalLM": _HfExamplesInfo( - "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="4.57.0.dev0" + "Qwen/Qwen3-VL-30B-A3B-Instruct", min_transformers_version="5.0.0" ), "TransformersMoEEmbeddingModel": _HfExamplesInfo( - "Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0" + "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0" ), "TransformersMoEForSequenceClassification": _HfExamplesInfo( - "Qwen/Qwen3-30B-A3B", min_transformers_version="4.57.0.dev0" + "Qwen/Qwen3-30B-A3B", min_transformers_version="5.0.0" ), "TransformersMultiModalEmbeddingModel": _HfExamplesInfo("google/gemma-3-4b-it"), "TransformersMultiModalForSequenceClassification": _HfExamplesInfo( diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index d8a1aace83325..06e51df32d184 
100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -82,7 +82,7 @@ def test_models( from packaging.version import Version installed = Version(transformers.__version__) - required = Version("4.57.0.dev0") + required = Version("5.0.0") if model == "allenai/OLMoE-1B-7B-0924" and installed < required: pytest.skip( "MoE models with the Transformers backend require " diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 96ec6e6b56acb..8017c947bf9ad 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -49,7 +49,7 @@ from functools import cached_property import torch import torch.nn as nn import torch.nn.functional as F -from transformers.activations import ACT2FN, PytorchGELUTanh +from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available @@ -651,7 +651,7 @@ class MoonVitPretrainedModel(PreTrainedModel): "num_heads": config.num_attention_heads, "hidden_dim": config.hidden_size, "mlp_dim": config.intermediate_size, - "activation": PytorchGELUTanh(), + "activation": ACT2FN["gelu_pytorch_tanh"], "attn_bias": True, "attn_implementation": config._attn_implementation, }, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a81acf9f9a36d..1ec12bdb55dfe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -34,7 +34,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoConfig, BatchFeature, PretrainedConfig +from transformers import BatchFeature, PretrainedConfig from transformers.models.qwen2_vl import Qwen2VLImageProcessor, Qwen2VLProcessor from transformers.models.qwen2_vl.configuration_qwen2_vl import ( Qwen2VLConfig, @@ -1651,9 +1651,7 @@ class Tarsier2Processor(Qwen2VLProcessor): class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo): def get_hf_config(self) -> Qwen2VLConfig: model_path = self.ctx.model_config.model - original_config = AutoConfig.from_pretrained(model_path) - config_dict = original_config.to_dict() - correct_config = Qwen2VLConfig.from_dict(config_dict) + correct_config = Qwen2VLConfig.from_pretrained(model_path) return correct_config From e2347dbf58eff0fa705146cde80c5292e333548b Mon Sep 17 00:00:00 2001 From: TJian Date: Fri, 31 Oct 2025 22:45:23 -0700 Subject: [PATCH 067/976] [Bugfix] [Model] Missing MRoPE function definition from `KeyeForConditionalGeneration` (#27895) Signed-off-by: tjtanaa --- .../models/multimodal/generation/test_keye.py | 86 ++++++++ vllm/model_executor/models/keye.py | 185 ++++++++++++++++-- 2 files changed, 254 insertions(+), 17 deletions(-) create mode 100644 tests/models/multimodal/generation/test_keye.py diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py new file mode 100644 index 0000000000000..6f98bde1d91ea --- /dev/null +++ b/tests/models/multimodal/generation/test_keye.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import asdict +from typing import NamedTuple + +import pytest +from PIL.Image import Image +from transformers import AutoProcessor + +from vllm import LLM, EngineArgs, SamplingParams +from vllm.multimodal.utils import encode_image_base64 + +MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview" + 
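For context on the MRoPE support this patch adds to keye.py further below: every image (and every video frame, after splitting the video grid along t) with grid (t, h, w) contributes three rows of position ids, one temporal, one height, one width, computed on the grid after dividing h and w by spatial_merge_size. A rough torch sketch of that layout for a single grid, illustrative only; the real get_mrope_input_positions added later in this patch also interleaves the text segments and applies running offsets:

import torch

def toy_mrope_positions(t: int, h: int, w: int, spatial_merge_size: int = 2):
    # spatial_merge_size=2 is just a toy value here; the model reads it from
    # hf_config.vision_config.spatial_merge_size.
    gh, gw = h // spatial_merge_size, w // spatial_merge_size
    t_idx = torch.arange(t).view(-1, 1).expand(-1, gh * gw).flatten()
    h_idx = torch.arange(gh).view(1, -1, 1).expand(t, -1, gw).flatten()
    w_idx = torch.arange(gw).view(1, 1, -1).expand(t, gh, -1).flatten()
    return torch.stack([t_idx, h_idx, w_idx])  # shape (3, t * gh * gw)

print(toy_mrope_positions(1, 4, 4).shape)  # torch.Size([3, 4])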
+QUESTION = "What is the content of each image?" + + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str + image_data: list[Image] + stop_token_ids: list[int] | None = None + chat_template: str | None = None + sampling_params: SamplingParams | None = None + + +@pytest.mark.core_model +@pytest.mark.parametrize("question", [QUESTION]) +def test_keye_vl( + image_assets, + question: str, +): + images = [asset.pil_image for asset in image_assets] + + image_urls = [ + f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images + ] + + engine_args = EngineArgs( + model=MODEL_NAME, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] + + processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + engine_args = asdict(engine_args) | {"seed": 42} + llm = LLM(**engine_args) + + sampling_params = SamplingParams( + temperature=0.0, max_tokens=256, stop_token_ids=None + ) + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": {"image": images}, + }, + sampling_params=sampling_params, + ) + + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + assert len(generated_text) > 10, ( + f"Generated text is too short: {generated_text}" + ) + print("-" * 50) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index acfd51a6d0cc1..5f8659a3064eb 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -17,7 +17,9 @@ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPo from transformers.utils import torch_int from vllm.attention.backends.registry import _Backend -from vllm.attention.layer import check_upstream_fa_availability +from vllm.attention.layer import ( + maybe_get_vit_flash_attn_backend, +) from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size @@ -56,12 +58,14 @@ from vllm.multimodal.processing import ( PromptUpdate, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( MultiModalEmbeddings, SupportsLoRA, + SupportsMRoPE, SupportsMultiModal, SupportsPP, ) @@ -337,7 +341,10 @@ def apply_rotary_pos_emb_flashatt( cos = cos.chunk(2, dim=-1)[0].contiguous() sin = sin.chunk(2, dim=-1)[0].contiguous() - from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + if current_platform.is_cuda(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + elif current_platform.is_rocm(): + from flash_attn.ops.triton.rotary import apply_rotary as apply_rotary_emb q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q) k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k) @@ -398,18 +405,28 @@ class KeyeSiglipAttention(nn.Module): attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False - if self.attn_backend != _Backend.FLASH_ATTN and check_upstream_fa_availability( - 
torch.get_default_dtype() - ): - self.attn_backend = _Backend.FLASH_ATTN - self.use_upstream_fa = True + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + use_upstream_fa=False, + attn_backend_override=attn_backend_override, + ) + ) - if self.attn_backend not in {_Backend.FLASH_ATTN, _Backend.XFORMERS}: + if self.attn_backend not in { + _Backend.FLASH_ATTN, + _Backend.XFORMERS, + _Backend.ROCM_AITER_FA, + }: raise RuntimeError( f"Keye-VL does not support {self.attn_backend} backend now." ) + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, + _Backend.ROCM_AITER_FA, + } + def forward( self, hidden_states: torch.Tensor, @@ -457,15 +474,10 @@ class KeyeSiglipAttention(nn.Module): self.head_dim, ) - if self.attn_backend == _Backend.FLASH_ATTN: - if self.use_upstream_fa: - from flash_attn import flash_attn_varlen_func - else: - from vllm.vllm_flash_attn import flash_attn_varlen_func - + if self.is_flash_attn_backend: q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) - output = flash_attn_varlen_func( + output = self.flash_attn_varlen_func( q, k, v, @@ -1542,7 +1554,7 @@ class BaseKeyeModule(nn.Module): dummy_inputs=KeyeDummyInputsBuilder, ) class KeyeForConditionalGeneration( - BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP + BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): def _build_projector( self, @@ -1611,3 +1623,142 @@ class KeyeForConditionalGeneration( return tuple( self._process_video_embeds(video_type, video_grid_thw, pixel_values_videos) ) + + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: list[list[int]] | torch.Tensor, + video_grid_thw: list[list[int]] | torch.Tensor, + context_len: int = 0, + seq_len: int | None = None, + second_per_grid_ts: list[float] | None = None, + audio_feature_lengths: torch.Tensor | None = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0: + video_grid_thw = video_grid_thw[0] + """Get mrope input positions and delta value (Keye series).""" + + def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]: + """ + Split grid_thw along the t dimension. + + Args: + grid_thw: shape [N, 3] tensor or nested list of [t, h, w]. + + Returns: + List of [1, h, w] rows, repeated t times for each original row. 
+ """ + + if isinstance(grid_thw, list): + grid_thw = torch.tensor(grid_thw, dtype=torch.long) + + if grid_thw.numel() == 0: + return [] + + t, hw = grid_thw[:, 0], grid_thw[:, 1:] + ones = torch.ones_like(hw[:, :1]) # [N,1] + out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0) + return out.tolist() + + video_grid_thw = split_thw(video_grid_thw) + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + + image_nums = len(image_grid_thw) + frame_nums = len(video_grid_thw) + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_frames = image_nums, frame_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + frame_nums): + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_frames > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 + else: + ed_video = len(input_tokens) + 1 + + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_frames -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + t_index = ( + ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + ) + .long() + .flatten() + ) + + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta From e67511884970af818bab20af3782ccebe08c716b Mon Sep 17 00:00:00 2001 From: Yihua Cheng Date: Sat, 1 Nov 2025 00:17:07 -0700 Subject: [PATCH 068/976] [Add] cmdline argument parsing for KV cache offloading modules (#27621) Signed-off-by: ApostaC Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/kv_connector/unit/test_config.py | 65 +++++++++++++++++++++++ vllm/config/cache.py | 12 +++++ vllm/config/vllm.py | 45 ++++++++++++++++ vllm/engine/arg_utils.py | 21 +++++++- 4 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 tests/v1/kv_connector/unit/test_config.py diff --git a/tests/v1/kv_connector/unit/test_config.py 
b/tests/v1/kv_connector/unit/test_config.py new file mode 100644 index 0000000000000..6cf86f3d5c4ac --- /dev/null +++ b/tests/v1/kv_connector/unit/test_config.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Tests for KV cache offloading configuration.""" + +import pytest + +from vllm.config import CacheConfig, KVTransferConfig, ParallelConfig, VllmConfig + +pytestmark = pytest.mark.cpu_test + + +@pytest.mark.parametrize( + "kv_offloading_backend,kv_offloading_size,tp,pp,expected_backend,expected_bytes", + [ + ("native", 4.0, 1, 1, "OffloadingConnector", 4.0 * (1 << 30)), + # bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB + ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30) / 4), + ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0), + # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB + ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0), + (None, None, 1, 1, None, None), + ], +) +def test_kv_connector( + kv_offloading_backend, kv_offloading_size, tp, pp, expected_backend, expected_bytes +): + kv_transfer_config = ( + KVTransferConfig(kv_connector_extra_config={"existing_key": "existing_value"}) + if expected_backend is not None + else None + ) + + vllm_config = VllmConfig( + cache_config=CacheConfig( + kv_offloading_backend=kv_offloading_backend, + kv_offloading_size=kv_offloading_size, + ), + kv_transfer_config=kv_transfer_config, + parallel_config=ParallelConfig( + tensor_parallel_size=tp, pipeline_parallel_size=pp + ), + ) + + # No KV transfer config expected + if expected_backend is None: + assert vllm_config.kv_transfer_config is expected_backend + return + + kv_transfer_config = vllm_config.kv_transfer_config + kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config + + assert kv_transfer_config.kv_connector == expected_backend + assert kv_transfer_config.kv_role == "kv_both" + + if kv_offloading_backend == "native": + assert kv_connector_extra_config["kv_bytes_per_rank"] == expected_bytes + assert kv_connector_extra_config["num_cpu_blocks"] == 0 + # Existing config should be preserved + assert kv_connector_extra_config["existing_key"] == "existing_value" + elif kv_offloading_backend == "lmcache": + assert kv_connector_extra_config["lmcache.local_cpu"] is True + assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes + # Existing config should be replaced + assert "existing_key" not in kv_connector_extra_config diff --git a/vllm/config/cache.py b/vllm/config/cache.py index d743d5aa9dd29..031df3091f1c6 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -24,6 +24,7 @@ BlockSize = Literal[1, 8, 16, 32, 64, 128, 256] CacheDType = Literal["auto", "bfloat16", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"] MambaDType = Literal["auto", "float32"] PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"] +KVOffloadingBackend = Literal["native", "lmcache"] @config @@ -128,6 +129,17 @@ class CacheConfig: gpu_memory_utilization. Note that kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization""" + kv_offloading_size: float | None = None + """Size of the KV cache offloading buffer in GiB. When TP > 1, this is + the total buffer size summed across all TP ranks. By default, this is set + to None, which means no KV offloading is enabled. When set with + kv_offloading_backend, vLLM will enable KV cache offloading to CPU""" + + kv_offloading_backend: KVOffloadingBackend | None = None + """The backend to use for KV cache offloading. 
Supported backends include + 'native' (vLLM native CPU offloading), 'lmcache' This option must be used + together with kv_offloading_size.""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 1acac70c32b03..ee91cb0ef5c36 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -289,6 +289,48 @@ class VllmConfig: return replace(self, model_config=model_config) + def _post_init_kv_transfer_config(self) -> None: + """Update KVTransferConfig based on top-level configs in VllmConfig. + + Right now, this function reads the offloading settings from + CacheConfig and configures the KVTransferConfig accordingly. + """ + if (kv_offloading_backend := self.cache_config.kv_offloading_backend) is None: + return + + # If no KVTransferConfig is provided, create a default one. + if self.kv_transfer_config is None: + self.kv_transfer_config = KVTransferConfig() + + if (kv_offloading_size := self.cache_config.kv_offloading_size) is None: + raise ValueError( + "You must set kv_offloading_size when kv_offloading_backend is set." + ) + num_kv_ranks = ( + self.parallel_config.tensor_parallel_size + * self.parallel_config.pipeline_parallel_size + ) + + if kv_offloading_backend == "native": + self.kv_transfer_config.kv_connector = "OffloadingConnector" + kv_bytes_per_rank = kv_offloading_size * (1 << 30) / num_kv_ranks + + # NOTE(ApostaC): the actual calculation for num_cpu_blocks should be + # done after the model's KV cache is initialized + self.kv_transfer_config.kv_connector_extra_config.update( + {"kv_bytes_per_rank": kv_bytes_per_rank, "num_cpu_blocks": 0} + ) + elif kv_offloading_backend == "lmcache": + self.kv_transfer_config.kv_connector = "LMCacheConnectorV1" + kv_gb_per_rank = kv_offloading_size / num_kv_ranks + self.kv_transfer_config.kv_connector_extra_config = { + "lmcache.local_cpu": True, + "lmcache.max_local_cpu_size": kv_gb_per_rank, + } + + # This is the same for all backends + self.kv_transfer_config.kv_role = "kv_both" + def __post_init__(self): """Verify configs are valid & consistent with each other.""" @@ -646,6 +688,9 @@ class VllmConfig: if "-quant_fp8" not in custom_ops: custom_ops.append("+quant_fp8") + # Handle the KV connector configs + self._post_init_kv_transfer_config() + def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: # remove the sizes that not multiple of tp_size when # enable sequence parallelism diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b6f922a95519b..66c75d944ec8b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -54,7 +54,13 @@ from vllm.config import ( VllmConfig, get_attr_docs, ) -from vllm.config.cache import BlockSize, CacheDType, MambaDType, PrefixCachingHashAlgo +from vllm.config.cache import ( + BlockSize, + CacheDType, + KVOffloadingBackend, + MambaDType, + PrefixCachingHashAlgo, +) from vllm.config.device import Device from vllm.config.model import ( ConvertOption, @@ -553,6 +559,11 @@ class EngineArgs: kv_sharing_fast_prefill: bool = CacheConfig.kv_sharing_fast_prefill + kv_offloading_size: float | None = CacheConfig.kv_offloading_size + kv_offloading_backend: KVOffloadingBackend | None = ( + CacheConfig.kv_offloading_backend + ) + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -896,6 +907,12 @@ class EngineArgs: cache_group.add_argument( "--mamba-block-size", **cache_kwargs["mamba_block_size"] ) + 
cache_group.add_argument( + "--kv-offloading-size", **cache_kwargs["kv_offloading_size"] + ) + cache_group.add_argument( + "--kv-offloading-backend", **cache_kwargs["kv_offloading_backend"] + ) # Multimodal related configs multimodal_kwargs = get_kwargs(MultiModalConfig) @@ -1387,6 +1404,8 @@ class EngineArgs: mamba_cache_dtype=self.mamba_cache_dtype, mamba_ssm_cache_dtype=self.mamba_ssm_cache_dtype, mamba_block_size=self.mamba_block_size, + kv_offloading_size=self.kv_offloading_size, + kv_offloading_backend=self.kv_offloading_backend, ) ray_runtime_env = None From 2c0c7c39bdf78ff4cf99a93f67066435e1712cd8 Mon Sep 17 00:00:00 2001 From: ai-jz <156989844+ai-jz@users.noreply.github.com> Date: Sat, 1 Nov 2025 01:04:52 -0700 Subject: [PATCH 069/976] feat(benchmarks): support HF model names in multi-turn benchmark (#27850) --- benchmarks/multi_turn/benchmark_serving_multi_turn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 67a085b40ed35..5d2ac66e5ab94 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1429,8 +1429,6 @@ async def main() -> None: random.seed(args.seed) np.random.seed(args.seed) - if not os.path.exists(args.model): - raise OSError(f"Path does not exist: {args.model}") logger.info("Loading tokenizer") tokenizer = AutoTokenizer.from_pretrained(args.model) From 799ce45cc160ffc0a3e1a0f958cc8e260b751808 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 1 Nov 2025 10:02:23 +0000 Subject: [PATCH 070/976] [Docs] Mock all imports for docs (#27873) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/mkdocs/hooks/generate_argparse.py | 60 ++++++++++++++++++++------ requirements/docs.txt | 8 ---- vllm/utils/cache.py | 4 +- 3 files changed, 49 insertions(+), 23 deletions(-) diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index ea89108f01fc2..ce1c5c53cf35a 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -3,6 +3,7 @@ import importlib import logging import sys +import traceback from argparse import SUPPRESS, HelpFormatter from pathlib import Path from typing import Literal @@ -16,7 +17,30 @@ ROOT_DIR = Path(__file__).parent.parent.parent.parent ARGPARSE_DOC_DIR = ROOT_DIR / "docs/argparse" sys.path.insert(0, str(ROOT_DIR)) + + +# Mock custom op code +class MockCustomOp: + @staticmethod + def register(name): + def decorator(cls): + return cls + + return decorator + + +noop = lambda *a, **k: None sys.modules["vllm._C"] = MagicMock() +sys.modules["vllm.model_executor.custom_op"] = MagicMock(CustomOp=MockCustomOp) +sys.modules["vllm.utils.torch_utils"] = MagicMock(direct_register_custom_op=noop) + +# Mock any version checks by reading from compiled CI requirements +with open(ROOT_DIR / "requirements/test.txt") as f: + VERSIONS = dict(line.strip().split("==") for line in f if "==" in line) +importlib.metadata.version = lambda name: VERSIONS.get(name) or "0.0.0" + +# Make torch.nn.Parameter safe to inherit from +sys.modules["torch.nn"] = MagicMock(Parameter=object) class PydanticMagicMock(MagicMock): @@ -31,20 +55,17 @@ class PydanticMagicMock(MagicMock): return core_schema.any_schema() -def auto_mock(module, attr, max_mocks=50): +def auto_mock(module, attr, max_mocks=100): """Function that automatically mocks missing modules during 
imports.""" logger.info("Importing %s from %s", attr, module) for _ in range(max_mocks): try: # First treat attr as an attr, then as a submodule - with patch("importlib.metadata.version", return_value="0.0.0"): - return getattr( - importlib.import_module(module), - attr, - importlib.import_module(f"{module}.{attr}"), - ) - except importlib.metadata.PackageNotFoundError as e: - raise e + return getattr( + importlib.import_module(module), + attr, + importlib.import_module(f"{module}.{attr}"), + ) except ModuleNotFoundError as e: logger.info("Mocking %s for argparse doc generation", e.name) sys.modules[e.name] = PydanticMagicMock(name=e.name) @@ -139,10 +160,19 @@ def create_parser(add_cli_args, **kwargs) -> FlexibleArgumentParser: Returns: FlexibleArgumentParser: A parser with markdown formatting for the class. """ - parser = FlexibleArgumentParser(add_json_tip=False) - parser.formatter_class = MarkdownFormatter - with patch("vllm.config.DeviceConfig.__post_init__"): - _parser = add_cli_args(parser, **kwargs) + try: + parser = FlexibleArgumentParser(add_json_tip=False) + parser.formatter_class = MarkdownFormatter + with patch("vllm.config.DeviceConfig.__post_init__"): + _parser = add_cli_args(parser, **kwargs) + except ModuleNotFoundError as e: + # Auto-mock runtime imports + if tb_list := traceback.extract_tb(e.__traceback__): + path = Path(tb_list[-1].filename).relative_to(ROOT_DIR) + auto_mock(module=".".join(path.parent.parts), attr=path.stem) + return create_parser(add_cli_args, **kwargs) + else: + raise e # add_cli_args might be in-place so return parser if _parser is None return _parser or parser @@ -184,3 +214,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): with open(doc_path, "w", encoding="utf-8") as f: f.write(super(type(parser), parser).format_help()) logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR)) + + +if __name__ == "__main__": + on_startup("build", False) diff --git a/requirements/docs.txt b/requirements/docs.txt index 00c314874016f..0fd6dbe22c512 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -9,12 +9,4 @@ mkdocs-git-revision-date-localized-plugin mkdocs-minify-plugin regex ruff - -# Required for argparse hook only --f https://download.pytorch.org/whl/cpu -cachetools -cloudpickle -py-cpuinfo -msgspec pydantic -torch diff --git a/vllm/utils/cache.py b/vllm/utils/cache.py index d5e08caa8a1ed..4338983f90601 100644 --- a/vllm/utils/cache.py +++ b/vllm/utils/cache.py @@ -3,7 +3,7 @@ from collections import UserDict from collections.abc import Callable, Hashable, Iterator, KeysView, Mapping from types import MappingProxyType -from typing import Generic, NamedTuple, TypeVar, cast, overload +from typing import NamedTuple, TypeVar, cast, overload import cachetools @@ -48,7 +48,7 @@ class CacheInfo(NamedTuple): ) -class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]): +class LRUCache(cachetools.LRUCache[_K, _V]): def __init__(self, capacity: float, getsizeof: Callable[[_V], float] | None = None): super().__init__(capacity, getsizeof) From 30a14b034fa387470a512e8004527ad1c28af303 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Sat, 1 Nov 2025 18:17:45 +0800 Subject: [PATCH 071/976] [V0 deprecation] Remove VLLM_USE_V1 usage in platform and v1 module (#27798) Signed-off-by: wangxiyuan Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/platforms/cuda.py | 190 ++++++++++++--------------- vllm/platforms/interface.py | 9 +- vllm/platforms/rocm.py | 84 +++++------- 
vllm/platforms/tpu.py | 4 - vllm/platforms/xpu.py | 9 +- vllm/v1/engine/async_llm.py | 16 --- vllm/v1/engine/llm_engine.py | 11 +- vllm/v1/executor/uniproc_executor.py | 9 +- 8 files changed, 128 insertions(+), 204 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index cc06f034fba32..32734c3aba5ef 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -276,17 +276,12 @@ class CudaPlatformBase(Platform): "FLASHMLA, FLASH_ATTN_MLA, or TRITON_MLA. Alternatively, set " "VLLM_MLA_DISABLE=1 to disable MLA for this model." ) - if not use_v1: - raise RuntimeError( - "MLA attention backends require the V1 engine. " - "Set VLLM_USE_V1=1 to enable them." - ) from vllm.attention.ops.flashmla import is_flashmla_dense_supported from vllm.attention.utils.fa_utils import flash_attn_supports_mla if use_sparse: - logger.info_once("Using Sparse MLA backend on V1 engine.") + logger.info_once("Using Sparse MLA backend.") return ( "vllm.v1.attention.backends.mla.flashmla_sparse." "FlashMLASparseBackend" @@ -313,15 +308,13 @@ class CudaPlatformBase(Platform): ) if use_cutlassmla: - logger.info_once( - "Using Cutlass MLA backend on V1 engine.", scope="local" - ) + logger.info_once("Using Cutlass MLA backend.", scope="local") return "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend" if use_flashinfermla: from vllm.v1.attention.backends.utils import set_kv_cache_layout set_kv_cache_layout("HND") - logger.info_once("Using FlashInfer MLA backend on V1 engine.") + logger.info_once("Using FlashInfer MLA backend.") return ( "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend" ) @@ -333,116 +326,107 @@ class CudaPlatformBase(Platform): block_size, ) else: - logger.info_once("Using FlashMLA backend on V1 engine.") + logger.info_once("Using FlashMLA backend.") return "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend" if use_flashattn: - logger.info_once("Using FlashAttention MLA backend on V1 engine.") + logger.info_once("Using FlashAttention MLA backend.") return ( "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend" ) if use_triton: - logger.info_once("Using Triton MLA backend on V1 engine.") + logger.info_once("Using Triton MLA backend.") return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend" - if use_v1: - FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501 - FLEX_ATTENTION_V1 = ( - "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 - ) - TRITON_ATTN = ( - "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 - ) - FLASH_ATTN_V1 = ( - "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 - ) - TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 - XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 - use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith( - "fp8" - ) + FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501 + FLEX_ATTENTION_V1 = ( + "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 + ) + TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 + FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 + TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 + XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 - 
if selected_backend == _Backend.FLASHINFER: - logger.info_once("Using FlashInfer backend on V1 engine.") - if cls.has_device_capability(100): - from vllm.v1.attention.backends.utils import set_kv_cache_layout + use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith( + "fp8" + ) + + if selected_backend == _Backend.FLASHINFER: + logger.info_once("Using FlashInfer backend.") + if cls.has_device_capability(100): + from vllm.v1.attention.backends.utils import set_kv_cache_layout + + set_kv_cache_layout("HND") + return FLASHINFER_V1 + elif selected_backend == _Backend.FLEX_ATTENTION: + logger.info_once("Using FlexAttention backend.") + return FLEX_ATTENTION_V1 + elif selected_backend == _Backend.TRITON_ATTN: + logger.info_once("Using Triton backend.") + return TRITON_ATTN + elif selected_backend == _Backend.FLASH_ATTN: + logger.info_once("Using Flash Attention backend.") + return FLASH_ATTN_V1 + elif selected_backend == _Backend.TREE_ATTN: + logger.info_once("Using Tree Attention backend.") + return TREE_ATTN_V1 + elif selected_backend == _Backend.XFORMERS: + logger.info_once("Using XFormers backend.") + return XFORMERS_V1 + + from vllm.attention.selector import is_attn_backend_supported + + # Default backends for V1 engine + # Prefer FlashInfer for Blackwell GPUs if installed + if cls.is_device_capability(100): + if is_default_backend_supported := is_attn_backend_supported( + FLASHINFER_V1, head_size, dtype + ): + from vllm.v1.attention.backends.utils import set_kv_cache_layout + + logger.info_once( + "Using FlashInfer backend with HND KV cache layout on " + "V1 engine by default for Blackwell (SM 10.0) GPUs." + ) + set_kv_cache_layout("HND") - set_kv_cache_layout("HND") return FLASHINFER_V1 - elif selected_backend == _Backend.FLEX_ATTENTION: - logger.info_once("Using FlexAttention backend on V1 engine.") - return FLEX_ATTENTION_V1 - elif selected_backend == _Backend.TRITON_ATTN: - logger.info_once("Using Triton backend on V1 engine.") + + if not is_default_backend_supported.can_import: + logger.warning_once( + "FlashInfer failed to import on Blackwell (SM 10.0) GPUs; " + "it is recommended to install FlashInfer for better " + "performance." + ) + + # FlashAttention is the default for SM 8.0+ GPUs + if cls.has_device_capability(80): + if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90): + logger.info_once("Using Triton backend.") return TRITON_ATTN - elif selected_backend == _Backend.FLASH_ATTN: - logger.info_once("Using Flash Attention backend on V1 engine.") + elif is_default_backend_supported := is_attn_backend_supported( + FLASH_ATTN_V1, head_size, dtype, allow_import_error=False + ): + logger.info_once("Using Flash Attention backend.") return FLASH_ATTN_V1 - elif selected_backend == _Backend.TREE_ATTN: - logger.info_once("Using Tree Attention backend on V1 engine.") - return TREE_ATTN_V1 - elif selected_backend == _Backend.XFORMERS: - logger.info_once("Using XFormers backend on V1 engine.") - return XFORMERS_V1 - from vllm.attention.selector import is_attn_backend_supported - - # Default backends for V1 engine - # Prefer FlashInfer for Blackwell GPUs if installed - if cls.is_device_capability(100): - if is_default_backend_supported := is_attn_backend_supported( - FLASHINFER_V1, head_size, dtype - ): - from vllm.v1.attention.backends.utils import set_kv_cache_layout - - logger.info_once( - "Using FlashInfer backend with HND KV cache layout on " - "V1 engine by default for Blackwell (SM 10.0) GPUs." 
- ) - set_kv_cache_layout("HND") - - return FLASHINFER_V1 - - if not is_default_backend_supported.can_import: - logger.warning_once( - "FlashInfer failed to import for V1 engine on " - "Blackwell (SM 10.0) GPUs; it is recommended to " - "install FlashInfer for better performance." - ) - - # FlashAttention is the default for SM 8.0+ GPUs - if cls.has_device_capability(80): - if (has_sink or use_fp8_kv_cache) and not cls.is_device_capability(90): - logger.info_once("Using Triton backend on V1 engine.") - return TRITON_ATTN - elif is_default_backend_supported := is_attn_backend_supported( - FLASH_ATTN_V1, head_size, dtype, allow_import_error=False - ): - logger.info_once("Using Flash Attention backend on V1 engine.") - return FLASH_ATTN_V1 - - # FlexAttention is the default for older GPUs - else: - logger.info_once("Using FlexAttention backend on V1 engine.") - return FLEX_ATTENTION_V1 - - assert not is_default_backend_supported - - use_flex_attention_reason = {} - if not is_default_backend_supported.head_size: - use_flex_attention_reason["head_size"] = head_size - if not is_default_backend_supported.dtype: - use_flex_attention_reason["dtype"] = dtype - - logger.info_once( - "Using FlexAttention backend for %s on V1 engine.", - ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()), - ) + # FlexAttention is the default for older GPUs + else: + logger.info_once("Using FlexAttention backend.") return FLEX_ATTENTION_V1 - raise RuntimeError( - "V0 attention backends have been removed. Set VLLM_USE_V1=1 " - "to select a supported backend." + assert not is_default_backend_supported + + use_flex_attention_reason = {} + if not is_default_backend_supported.head_size: + use_flex_attention_reason["head_size"] = head_size + if not is_default_backend_supported.dtype: + use_flex_attention_reason["dtype"] = dtype + + logger.info_once( + "Using FlexAttention backend for %s.", + ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()), ) + return FLEX_ATTENTION_V1 @classmethod def get_punica_wrapper(cls) -> str: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 4462829564391..15e3b3a22bdee 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -467,14 +467,7 @@ class Platform: """ Whether to use allgather in LogitsProcessor to gather the logits. """ - import vllm.envs as envs - from vllm.config import get_current_vllm_config - - parallel_config = get_current_vllm_config().parallel_config - return ( - envs.VLLM_USE_V1 - or parallel_config.distributed_executor_backend == "external_launcher" - ) + return True @classmethod def use_custom_allreduce(cls) -> bool: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d3535c9781c48..0c03a5564db89 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -149,7 +149,7 @@ def use_rocm_custom_paged_attention( # disabled due to observed numerical discrepancy. 
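Stepping back to the cuda.py changes above: with the V0 branches gone, the backend choice reduces to a single priority chain. A simplified sketch of that order, illustrative only (it ignores the MLA paths and the Triton fallback used for sinks or fp8 KV cache on non-Hopper GPUs):

def pick_backend(selected, device_capability, flashinfer_ok, flash_attn_ok):
    # selected: an explicitly requested backend, or None.
    if selected is not None:
        return selected
    if device_capability == 100 and flashinfer_ok:  # Blackwell
        return "FLASHINFER"
    if device_capability >= 80 and flash_attn_ok:
        return "FLASH_ATTN"
    return "FLEX_ATTENTION"

print(pick_backend(None, 100, True, True))  # FLASHINFER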
if ON_GFX9: return ( - (not envs.VLLM_USE_V1 or sliding_window == 0 or sliding_window == (-1, -1)) + (sliding_window == 0 or sliding_window == (-1, -1)) and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) @@ -163,11 +163,7 @@ def use_rocm_custom_paged_attention( else: return ( ON_GFX11_GFX12 - and ( - not envs.VLLM_USE_V1 - or sliding_window == 0 - or sliding_window == (-1, -1) - ) + and (sliding_window == 0 or sliding_window == (-1, -1)) and (qtype == torch.half or qtype == torch.bfloat16) and head_size == 128 and block_size == 16 @@ -236,12 +232,6 @@ class RocmPlatform(Platform): if use_sparse: raise NotImplementedError("Sparse Attention is not supported on ROCm.") if use_mla: - if not use_v1: - raise RuntimeError( - "MLA attention backends require the V1 engine. " - "Set VLLM_USE_V1=1 to enable them." - ) - from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( is_aiter_mla_enabled, ) @@ -255,7 +245,7 @@ class RocmPlatform(Platform): if selected_backend == _Backend.TRITON_MLA: if block_size != 1: - logger.info_once("Using Triton MLA backend on V1 engine.") + logger.info_once("Using Triton MLA backend.") return "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend" raise ValueError( f" The selected backend, {selected_backend.name}," @@ -263,7 +253,7 @@ class RocmPlatform(Platform): ) if selected_backend == _Backend.ROCM_AITER_MLA: if block_size == 1: - logger.info("Using AITER MLA backend on V1 engine.") + logger.info("Using AITER MLA backend.") return ( "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" # noqa: E501 ) @@ -277,41 +267,33 @@ class RocmPlatform(Platform): f"is not MLA type while requested for MLA backend." ) - if envs.VLLM_USE_V1: - if selected_backend == _Backend.FLEX_ATTENTION: - logger.info("Using FlexAttention backend on V1 engine.") - return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" - if ( - envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9() - ) or selected_backend == _Backend.ROCM_AITER_FA: - logger.info("Using Aiter Flash Attention backend on V1 engine.") - return ( - "vllm.v1.attention.backends." - "rocm_aiter_fa.AiterFlashAttentionBackend" - ) - if ( - envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION - ) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN: - logger.info("Using Aiter Unified Attention backend on V1 engine.") - return ( - "vllm.v1.attention.backends." - "rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend" - ) - if ( - envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION - or selected_backend == _Backend.ROCM_ATTN - ): - # rocm specific backend, with aiter and/or - # triton prefix-prefill - logger.info("Using Rocm Attention backend on V1 engine.") - return "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend" - # default case, using triton unified attention - logger.info("Using Triton Attention backend on V1 engine.") - return "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" - raise RuntimeError( - "V0 attention backends have been removed. Set VLLM_USE_V1=1 " - "to select a supported backend." 
- ) + if selected_backend == _Backend.FLEX_ATTENTION: + logger.info("Using FlexAttention backend.") + return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" + if ( + envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9() + ) or selected_backend == _Backend.ROCM_AITER_FA: + logger.info("Using Aiter Flash Attention backend.") + return "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" + if ( + envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION + ) or selected_backend == _Backend.ROCM_AITER_UNIFIED_ATTN: + logger.info("Using Aiter Unified Attention backend.") + return ( + "vllm.v1.attention.backends." + "rocm_aiter_unified_attn.RocmAiterUnifiedAttentionBackend" + ) + if ( + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION + or selected_backend == _Backend.ROCM_ATTN + ): + # rocm specific backend, with aiter and/or + # triton prefix-prefill + logger.info("Using Rocm Attention backend.") + return "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend" + # default case, using triton unified attention + logger.info("Using Triton Attention backend.") + return "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" @classmethod def set_device(cls, device: torch.device) -> None: @@ -372,7 +354,6 @@ class RocmPlatform(Platform): parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE - use_v1 = envs.VLLM_USE_V1 use_aiter_rms_norm = ( envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_RMSNORM ) @@ -384,8 +365,7 @@ class RocmPlatform(Platform): parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" # Aiter rms norm perform best when CUDA Graph capture is enabled. if ( - use_v1 - and use_aiter_rms_norm + use_aiter_rms_norm and not is_eager_execution and "-rms_norm" not in compilation_config.custom_ops ): diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 0a14ee011f7f2..1a4b67a1762f3 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -204,10 +204,6 @@ class TpuPlatform(Platform): def get_device_communicator_cls(cls) -> str: return "vllm.distributed.device_communicators.tpu_communicator.TpuCommunicator" # noqa - @classmethod - def use_all_gather(cls) -> bool: - return True - @classmethod def validate_request( cls, diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 07ab759e4baa6..e4ecd0c807dac 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -66,16 +66,13 @@ class XPUPlatform(Platform): if use_sparse: raise NotImplementedError("Sparse Attention is not supported on XPU.") - use_v1 = envs.VLLM_USE_V1 - if not use_v1: - raise ValueError("XPU backend only supports V1.") TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 if selected_backend == _Backend.TRITON_ATTN: - logger.info_once("Using Triton backend on V1 engine.") + logger.info_once("Using Triton backend.") return TRITON_ATTN elif selected_backend == _Backend.FLASH_ATTN: - logger.info_once("Using Flash Attention backend on V1 engine.") + logger.info_once("Using Flash Attention backend.") return FLASH_ATTN elif selected_backend: raise ValueError( @@ -83,7 +80,7 @@ class XPUPlatform(Platform): f"with use_v1: {use_v1} use_mla: {use_mla}" ) - logger.info("Using Flash Attention backend on V1 engine.") + logger.info("Using Flash Attention backend.") return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" @classmethod diff 
--git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index dc61d45015682..f0d5b77e8e183 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -88,14 +88,6 @@ class AsyncLLM(EngineClient): Returns: None """ - if not envs.VLLM_USE_V1: - raise ValueError( - "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github." - ) - # Ensure we can serialize custom transformer configs maybe_register_config_serialize_by_value() @@ -206,14 +198,6 @@ class AsyncLLM(EngineClient): client_index: int = 0, disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLM": - if not envs.VLLM_USE_V1: - raise ValueError( - "Using V1 AsyncLLMEngine, but envs.VLLM_USE_V1=False. " - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github." - ) - # Create the LLMEngine. return cls( vllm_config=vllm_config, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c2ca9579d55ea..f44b6b2070d9f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -58,18 +58,9 @@ class LLMEngine: use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - if not envs.VLLM_USE_V1: - raise ValueError( - "Using V1 LLMEngine, but envs.VLLM_USE_V1=False. " - "This should not happen. As a workaround, try using " - "LLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github." - ) - if stat_loggers is not None: raise NotImplementedError( - "Passing StatLoggers to LLMEngine in V1 is not yet supported. " - "Set VLLM_USE_V1=0 and file and issue on Github." + "Passing StatLoggers to LLMEngine is not yet supported." 
) self.vllm_config = vllm_config diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index f17d3c3092701..32f00949b7f74 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -124,11 +124,10 @@ class ExecutorWithExternalLauncher(UniProcExecutor): def _init_executor(self) -> None: """Initialize the worker and load the model.""" - if envs.VLLM_USE_V1: - assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, ( - "To get deterministic execution in V1, " - "please set VLLM_ENABLE_V1_MULTIPROCESSING=0" - ) + assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, ( + "To get deterministic execution, " + "please set VLLM_ENABLE_V1_MULTIPROCESSING=0" + ) super()._init_executor() def _distributed_args(self) -> tuple[str, int, int]: From d811b442d305b33b3aca2836c5d7f761effe76de Mon Sep 17 00:00:00 2001 From: Haco <75477391+xiaohajiayou@users.noreply.github.com> Date: Sat, 1 Nov 2025 22:52:43 +0800 Subject: [PATCH 072/976] [Bugfix] DeepSeek V3.2 MTP metadata & CUDA graph issues (#26779) Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm/v1/spec_decode/eagle.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 35c2e73e8ee2c..1e18eea2330a4 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -109,6 +109,7 @@ class EagleProposer: else [] ) + self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes) # persistent buffers for cuda graph self.input_ids = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=device @@ -939,7 +940,7 @@ class EagleProposer: self.vllm_config, DeepseekV32IndexerCache ) draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names - self.attn_layer_names = list(draft_attn_layer_names) + self.attn_layer_names = list(draft_attn_layer_names - draft_indexer_layer_names) self.indexer_layer_names = list(draft_indexer_layer_names) if self.indexer_layer_names: @@ -1050,16 +1051,18 @@ class EagleProposer: num_tokens: int, use_cudagraphs=True, ) -> None: - if use_cudagraphs and num_tokens <= self.cudagraph_batch_sizes[-1]: + # Determine if CUDA graphs should be used for this run. 
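# Note (added commentary, not a line from the patch): use_cuda_graph is ANDed
# with bool(self.cudagraph_batch_sizes) earlier in this same patch, so the
# cudagraphs_enabled guard below also protects the
# self.cudagraph_batch_sizes[-1] lookup from running on an empty list when no
# draft CUDA graphs were captured.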
+ cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph + if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]: num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) with set_forward_context( None, self.vllm_config, num_tokens=num_tokens, - cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE - if use_cudagraphs - else CUDAGraphMode.NONE, + cudagraph_runtime_mode=( + CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE + ), ): if self.supports_mm_inputs: input_ids = None From 99d69af9ece094acb94901439925f8468b32326a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 1 Nov 2025 23:28:54 +0800 Subject: [PATCH 073/976] [Bugfix] Python 3.10 compatibility for `Self` (#27918) Signed-off-by: DarkLight1337 --- vllm/config/structured_outputs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config/structured_outputs.py b/vllm/config/structured_outputs.py index 85b6e42264a42..eb1cc7220b8fe 100644 --- a/vllm/config/structured_outputs.py +++ b/vllm/config/structured_outputs.py @@ -2,10 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from typing import Any, Literal, Self +from typing import Any, Literal from pydantic import model_validator from pydantic.dataclasses import dataclass +from typing_extensions import Self from vllm.config.utils import config From af6e19f50f1d5d0c3801948c3ab17b2af231c259 Mon Sep 17 00:00:00 2001 From: wenxindongwork <161090399+wenxindongwork@users.noreply.github.com> Date: Sat, 1 Nov 2025 11:14:44 -0600 Subject: [PATCH 074/976] [Core][TPU] Support TPU Data Parallalism (#27365) Signed-off-by: wenxindongwork --- vllm/entrypoints/llm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 758e16c89e694..b0b996ab2fec5 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -67,6 +67,7 @@ from vllm.outputs import ( RequestOutput, ScoringRequestOutput, ) +from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams from vllm.tasks import PoolingTask @@ -289,7 +290,11 @@ class LLM: # warn about single-process data parallel usage. _dp_size = int(kwargs.get("data_parallel_size", 1)) _distributed_executor_backend = kwargs.get("distributed_executor_backend") - if _dp_size > 1 and not _distributed_executor_backend == "external_launcher": + if ( + _dp_size > 1 + and not _distributed_executor_backend == "external_launcher" + and not current_platform.is_tpu() + ): raise ValueError( f"LLM(data_parallel_size={_dp_size}) is not supported for single-" "process usage and may hang. Please use " From c2ed069b32e2805c05a858c6157f4c6393b145a8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 1 Nov 2025 10:51:24 -0700 Subject: [PATCH 075/976] [BugFix] Fix mixed penalties batch with async scheduling (#27910) Signed-off-by: Nick Hill --- vllm/v1/sample/ops/penalties.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index 898b90d41abae..241d9de957ea2 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -21,6 +21,14 @@ def apply_all_penalties( """ _, vocab_size = logits.shape output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, logits.device) + + # In the async scheduling case, rows that won't have penalties applied may contain + # -1 placeholder token ids. 
We must replace these with valid token ids so that the + # scatter done in apply_penalties is valid. + # NOTE(nick): The penalties implementation is currently quite inefficient and + # will be reworked anyhow. + output_tokens_t.masked_fill_(output_tokens_t == -1, vocab_size) + return apply_penalties( logits, prompt_token_ids, From 1e88fb751bce13c74355d177fd06035858ce77c4 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Sat, 1 Nov 2025 19:45:42 +0000 Subject: [PATCH 076/976] Adds anthropic /v1/messages endpoint to openai api_server (#27882) Signed-off-by: bbartels Signed-off-by: Benjamin Bartels --- tests/entrypoints/anthropic/__init__.py | 0 .../{anthropic => openai}/test_messages.py | 72 ++--- tests/utils.py | 142 +-------- vllm/entrypoints/anthropic/api_server.py | 301 ------------------ vllm/entrypoints/openai/api_server.py | 86 +++++ 5 files changed, 139 insertions(+), 462 deletions(-) delete mode 100644 tests/entrypoints/anthropic/__init__.py rename tests/entrypoints/{anthropic => openai}/test_messages.py (68%) delete mode 100644 vllm/entrypoints/anthropic/api_server.py diff --git a/tests/entrypoints/anthropic/__init__.py b/tests/entrypoints/anthropic/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/entrypoints/anthropic/test_messages.py b/tests/entrypoints/openai/test_messages.py similarity index 68% rename from tests/entrypoints/anthropic/test_messages.py rename to tests/entrypoints/openai/test_messages.py index 4e35554b4e330..3e390ad496428 100644 --- a/tests/entrypoints/anthropic/test_messages.py +++ b/tests/entrypoints/openai/test_messages.py @@ -5,7 +5,7 @@ import anthropic import pytest import pytest_asyncio -from ...utils import RemoteAnthropicServer +from ...utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen3-0.6B" @@ -23,13 +23,13 @@ def server(): # noqa: F811 "claude-3-7-sonnet-latest", ] - with RemoteAnthropicServer(MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @pytest_asyncio.fixture async def client(server): - async with server.get_async_client() as async_client: + async with server.get_async_client_anthropic() as async_client: yield async_client @@ -105,37 +105,37 @@ async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic): print(f"Anthropic response: {resp.model_dump_json()}") - @pytest.mark.asyncio - async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic): - resp = await client.messages.create( - model="claude-3-7-sonnet-latest", - max_tokens=1024, - messages=[ - { - "role": "user", - "content": "What's the weather like in New York today?", - } - ], - tools=[ - { - "name": "get_current_weather", - "description": "Useful for querying the weather " - "in a specified city.", - "input_schema": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "City or region, for example: " - "New York, London, Tokyo, etc.", - } - }, - "required": ["location"], - }, - } - ], - stream=True, - ) - async for chunk in resp: - print(chunk.model_dump_json()) +@pytest.mark.asyncio +async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic): + resp = await client.messages.create( + model="claude-3-7-sonnet-latest", + max_tokens=1024, + messages=[ + { + "role": "user", + "content": "What's the weather like in New York today?", + } + ], + tools=[ + { + "name": "get_current_weather", + "description": "Useful for querying the weather in a specified city.", + "input_schema": { + 
"type": "object", + "properties": { + "location": { + "type": "string", + "description": "City or region, for example: " + "New York, London, Tokyo, etc.", + } + }, + "required": ["location"], + }, + } + ], + stream=True, + ) + + async for chunk in resp: + print(chunk.model_dump_json()) diff --git a/tests/utils.py b/tests/utils.py index af4ce6ebaeda2..c8f18384c5114 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -247,6 +247,23 @@ class RemoteOpenAIServer: **kwargs, ) + def get_client_anthropic(self, **kwargs): + if "timeout" not in kwargs: + kwargs["timeout"] = 600 + return anthropic.Anthropic( + base_url=self.url_for(), + api_key=self.DUMMY_API_KEY, + max_retries=0, + **kwargs, + ) + + def get_async_client_anthropic(self, **kwargs): + if "timeout" not in kwargs: + kwargs["timeout"] = 600 + return anthropic.AsyncAnthropic( + base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs + ) + class RemoteOpenAIServerCustom(RemoteOpenAIServer): """Launch test server with custom child process""" @@ -293,131 +310,6 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): self.proc.kill() -class RemoteAnthropicServer: - DUMMY_API_KEY = "token-abc123" # vLLM's Anthropic server does not need API key - - def __init__( - self, - model: str, - vllm_serve_args: list[str], - *, - env_dict: dict[str, str] | None = None, - seed: int | None = 0, - auto_port: bool = True, - max_wait_seconds: float | None = None, - ) -> None: - if auto_port: - if "-p" in vllm_serve_args or "--port" in vllm_serve_args: - raise ValueError( - "You have manually specified the port when `auto_port=True`." - ) - - # Don't mutate the input args - vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())] - if seed is not None: - if "--seed" in vllm_serve_args: - raise ValueError( - f"You have manually specified the seed when `seed={seed}`." 
- ) - - vllm_serve_args = vllm_serve_args + ["--seed", str(seed)] - - parser = FlexibleArgumentParser(description="vLLM's remote Anthropic server.") - subparsers = parser.add_subparsers(required=False, dest="subparser") - parser = ServeSubcommand().subparser_init(subparsers) - args = parser.parse_args(["--model", model, *vllm_serve_args]) - self.host = str(args.host or "localhost") - self.port = int(args.port) - - self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None - - # download the model before starting the server to avoid timeout - is_local = os.path.isdir(model) - if not is_local: - engine_args = AsyncEngineArgs.from_cli_args(args) - model_config = engine_args.create_model_config() - load_config = engine_args.create_load_config() - - model_loader = get_model_loader(load_config) - model_loader.download_model(model_config) - - env = os.environ.copy() - # the current process might initialize cuda, - # to be safe, we should use spawn method - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - if env_dict is not None: - env.update(env_dict) - self.proc = subprocess.Popen( - [ - sys.executable, - "-m", - "vllm.entrypoints.anthropic.api_server", - model, - *vllm_serve_args, - ], - env=env, - stdout=sys.stdout, - stderr=sys.stderr, - ) - max_wait_seconds = max_wait_seconds or 240 - self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.proc.terminate() - try: - self.proc.wait(8) - except subprocess.TimeoutExpired: - # force kill if needed - self.proc.kill() - - def _wait_for_server(self, *, url: str, timeout: float): - # run health check - start = time.time() - while True: - try: - if requests.get(url).status_code == 200: - break - except Exception: - # this exception can only be raised by requests.get, - # which means the server is not ready yet. - # the stack trace is not useful, so we suppress it - # by using `raise from None`. 
- result = self.proc.poll() - if result is not None and result != 0: - raise RuntimeError("Server exited unexpectedly.") from None - - time.sleep(0.5) - if time.time() - start > timeout: - raise RuntimeError("Server failed to start in time.") from None - - @property - def url_root(self) -> str: - return f"http://{self.host}:{self.port}" - - def url_for(self, *parts: str) -> str: - return self.url_root + "/" + "/".join(parts) - - def get_client(self, **kwargs): - if "timeout" not in kwargs: - kwargs["timeout"] = 600 - return anthropic.Anthropic( - base_url=self.url_for(), - api_key=self.DUMMY_API_KEY, - max_retries=0, - **kwargs, - ) - - def get_async_client(self, **kwargs): - if "timeout" not in kwargs: - kwargs["timeout"] = 600 - return anthropic.AsyncAnthropic( - base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs - ) - - def _test_completion( client: openai.OpenAI, model: str, diff --git a/vllm/entrypoints/anthropic/api_server.py b/vllm/entrypoints/anthropic/api_server.py deleted file mode 100644 index df877f99b084f..0000000000000 --- a/vllm/entrypoints/anthropic/api_server.py +++ /dev/null @@ -1,301 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# Adapted from: -# https://github.com/vllm/vllm/entrypoints/openai/api_server.py - -import asyncio -import signal -import tempfile -from argparse import Namespace -from http import HTTPStatus - -import uvloop -from fastapi import APIRouter, Depends, FastAPI, Request -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, Response, StreamingResponse -from starlette.datastructures import State - -import vllm.envs as envs -from vllm.engine.protocol import EngineClient -from vllm.entrypoints.anthropic.protocol import ( - AnthropicErrorResponse, - AnthropicMessagesRequest, - AnthropicMessagesResponse, -) -from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages -from vllm.entrypoints.launcher import serve_http -from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.api_server import ( - build_async_engine_client, - create_server_socket, - lifespan, - load_log_config, - validate_api_server_args, - validate_json_request, -) -from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args -from vllm.entrypoints.openai.protocol import ErrorResponse -from vllm.entrypoints.openai.serving_models import ( - BaseModelPath, - OpenAIServingModels, -) - -# -# yapf: enable -from vllm.entrypoints.openai.tool_parsers import ToolParserManager -from vllm.entrypoints.utils import ( - cli_env_setup, - load_aware_call, - process_chat_template, - process_lora_modules, - with_cancellation, -) -from vllm.logger import init_logger -from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.utils.network_utils import is_valid_ipv6_address -from vllm.utils.system_utils import set_ulimit -from vllm.version import __version__ as VLLM_VERSION - -prometheus_multiproc_dir: tempfile.TemporaryDirectory - -# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) -logger = init_logger("vllm.entrypoints.anthropic.api_server") - -_running_tasks: set[asyncio.Task] = set() - -router = APIRouter() - - -def messages(request: Request) -> AnthropicServingMessages: - return request.app.state.anthropic_serving_messages - - -def engine_client(request: Request) -> EngineClient: - return request.app.state.engine_client - - -@router.get("/health", 
response_class=Response) -async def health(raw_request: Request) -> Response: - """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) - - -@router.get("/ping", response_class=Response) -@router.post("/ping", response_class=Response) -async def ping(raw_request: Request) -> Response: - """Ping check. Endpoint required for SageMaker""" - return await health(raw_request) - - -@router.post( - "/v1/messages", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, - HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse}, - }, -) -@with_cancellation -@load_aware_call -async def create_messages(request: AnthropicMessagesRequest, raw_request: Request): - handler = messages(raw_request) - if handler is None: - return messages(raw_request).create_error_response( - message="The model does not support Messages API" - ) - - generator = await handler.create_messages(request, raw_request) - - if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump()) - - elif isinstance(generator, AnthropicMessagesResponse): - logger.debug( - "Anthropic Messages Response: %s", generator.model_dump(exclude_none=True) - ) - return JSONResponse(content=generator.model_dump(exclude_none=True)) - - return StreamingResponse(content=generator, media_type="text/event-stream") - - -async def init_app_state( - engine_client: EngineClient, - state: State, - args: Namespace, -) -> None: - vllm_config = engine_client.vllm_config - - if args.served_model_name is not None: - served_model_names = args.served_model_name - else: - served_model_names = [args.model] - - if args.disable_log_requests: - request_logger = None - else: - request_logger = RequestLogger(max_log_len=args.max_log_len) - - base_model_paths = [ - BaseModelPath(name=name, model_path=args.model) for name in served_model_names - ] - - state.engine_client = engine_client - state.log_stats = not args.disable_log_stats - state.vllm_config = vllm_config - model_config = vllm_config.model_config - - default_mm_loras = ( - vllm_config.lora_config.default_mm_loras - if vllm_config.lora_config is not None - else {} - ) - lora_modules = process_lora_modules(args.lora_modules, default_mm_loras) - - resolved_chat_template = await process_chat_template( - args.chat_template, engine_client, model_config - ) - - state.openai_serving_models = OpenAIServingModels( - engine_client=engine_client, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - ) - await state.openai_serving_models.init_static_loras() - state.anthropic_serving_messages = AnthropicServingMessages( - engine_client, - state.openai_serving_models, - args.response_role, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, - return_tokens_as_token_ids=args.return_tokens_as_token_ids, - enable_auto_tools=args.enable_auto_tool_choice, - tool_parser=args.tool_call_parser, - reasoning_parser=args.reasoning_parser, - enable_prompt_tokens_details=args.enable_prompt_tokens_details, - enable_force_include_usage=args.enable_force_include_usage, - ) - - -def setup_server(args): - """Validate API server args, set up signal handler, create socket - ready to serve.""" - - logger.info("vLLM API server version %s", VLLM_VERSION) - - if 
args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: - ToolParserManager.import_tool_parser(args.tool_parser_plugin) - - validate_api_server_args(args) - - # workaround to make sure that we bind the port before the engine is set up. - # This avoids race conditions with ray. - # see https://github.com/vllm-project/vllm/issues/8204 - sock_addr = (args.host or "", args.port) - sock = create_server_socket(sock_addr) - - # workaround to avoid footguns where uvicorn drops requests with too - # many concurrent requests active - set_ulimit() - - def signal_handler(*_) -> None: - # Interrupt server on sigterm while initializing - raise KeyboardInterrupt("terminated") - - signal.signal(signal.SIGTERM, signal_handler) - - addr, port = sock_addr - is_ssl = args.ssl_keyfile and args.ssl_certfile - host_part = f"[{addr}]" if is_valid_ipv6_address(addr) else addr or "0.0.0.0" - listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" - - return listen_address, sock - - -async def run_server(args, **uvicorn_kwargs) -> None: - """Run a single-worker API server.""" - listen_address, sock = setup_server(args) - await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) - - -def build_app(args: Namespace) -> FastAPI: - app = FastAPI(lifespan=lifespan) - app.include_router(router) - app.root_path = args.root_path - - app.add_middleware( - CORSMiddleware, - allow_origins=args.allowed_origins, - allow_credentials=args.allow_credentials, - allow_methods=args.allowed_methods, - allow_headers=args.allowed_headers, - ) - - return app - - -async def run_server_worker( - listen_address, sock, args, client_config=None, **uvicorn_kwargs -) -> None: - """Run a single API server worker.""" - - if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: - ToolParserManager.import_tool_parser(args.tool_parser_plugin) - - server_index = client_config.get("client_index", 0) if client_config else 0 - - # Load logging config for uvicorn if specified - log_config = load_log_config(args.log_config_file) - if log_config is not None: - uvicorn_kwargs["log_config"] = log_config - - async with build_async_engine_client( - args, - client_config=client_config, - ) as engine_client: - app = build_app(args) - - await init_app_state(engine_client, app.state, args) - - logger.info("Starting vLLM API server %d on %s", server_index, listen_address) - shutdown_task = await serve_http( - app, - sock=sock, - enable_ssl_refresh=args.enable_ssl_refresh, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - # NOTE: When the 'disable_uvicorn_access_log' value is True, - # no access log will be output. - access_log=not args.disable_uvicorn_access_log, - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs, - **uvicorn_kwargs, - ) - - # NB: Await server shutdown only after the backend context is exited - try: - await shutdown_task - finally: - sock.close() - - -if __name__ == "__main__": - # NOTE(simon): - # This section should be in sync with vllm/entrypoints/cli/main.py for CLI - # entrypoints. - cli_env_setup() - parser = FlexibleArgumentParser( - description="vLLM Anthropic-Compatible RESTful API server." 
- ) - parser = make_arg_parser(parser) - args = parser.parse_args() - validate_parsed_serve_args(args) - - uvloop.run(run_server(args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8fa71855f8f66..22b5584749ae7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -41,6 +41,13 @@ import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import Device, EngineClient +from vllm.entrypoints.anthropic.protocol import ( + AnthropicError, + AnthropicErrorResponse, + AnthropicMessagesRequest, + AnthropicMessagesResponse, +) +from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args @@ -308,6 +315,10 @@ def responses(request: Request) -> OpenAIServingResponses | None: return request.app.state.openai_serving_responses +def messages(request: Request) -> AnthropicServingMessages: + return request.app.state.anthropic_serving_messages + + def chat(request: Request) -> OpenAIServingChat | None: return request.app.state.openai_serving_chat @@ -591,6 +602,63 @@ async def cancel_responses(response_id: str, raw_request: Request): return JSONResponse(content=response.model_dump()) +@router.post( + "/v1/messages", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": AnthropicErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": AnthropicErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": AnthropicErrorResponse}, + }, +) +@with_cancellation +@load_aware_call +async def create_messages(request: AnthropicMessagesRequest, raw_request: Request): + def translate_error_response(response: ErrorResponse) -> JSONResponse: + anthropic_error = AnthropicErrorResponse( + error=AnthropicError( + type=response.error.type, + message=response.error.message, + ) + ) + return JSONResponse( + status_code=response.error.code, content=anthropic_error.model_dump() + ) + + handler = messages(raw_request) + if handler is None: + error = base(raw_request).create_error_response( + message="The model does not support Messages API" + ) + return translate_error_response(error) + + try: + generator = await handler.create_messages(request, raw_request) + except Exception as e: + logger.exception("Error in create_messages: %s", e) + return JSONResponse( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + content=AnthropicErrorResponse( + error=AnthropicError( + type="internal_error", + message=str(e), + ) + ).model_dump(), + ) + + if isinstance(generator, ErrorResponse): + return translate_error_response(generator) + + elif isinstance(generator, AnthropicMessagesResponse): + logger.debug( + "Anthropic Messages Response: %s", generator.model_dump(exclude_none=True) + ) + return JSONResponse(content=generator.model_dump(exclude_none=True)) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + @router.post( "/v1/chat/completions", dependencies=[Depends(validate_json_request)], @@ -1817,6 +1885,24 @@ async def init_app_state( if "transcription" in supported_tasks else None ) + state.anthropic_serving_messages = ( + AnthropicServingMessages( + engine_client, + state.openai_serving_models, + 
args.response_role, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "generate" in supported_tasks + else None + ) state.enable_server_load_tracking = args.enable_server_load_tracking state.server_load_metrics = 0 From 685c99ee77b4818dcdd15b30fe0e0eff0d5d22ec Mon Sep 17 00:00:00 2001 From: Yue Zhang <81500899+KevinCheung2259@users.noreply.github.com> Date: Sun, 2 Nov 2025 05:08:56 +0800 Subject: [PATCH 077/976] [KV offload] Offloading connector async scheduling support (#27648) Signed-off-by: KevinCheung2259 <2651309292@qq.com> Co-authored-by: Nick Hill --- .../kv_transfer/kv_connector/v1/offloading_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 19344e5784c23..7567c7fae5789 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -274,8 +274,8 @@ class OffloadingConnectorScheduler: if num_new_blocks <= 0: continue - num_gpu_blocks = num_blocks * self.block_size_factor - assert len(req.block_hashes) >= num_gpu_blocks + # NOTE: In async scheduling, placeholders may temporarily make + # len(req.block_hashes) < num_blocks * self.block_size_factor. 
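+            # The hash count therefore cannot be validated against
+            # num_blocks * self.block_size_factor at this point.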
new_block_hashes = self._get_block_hashes( req, start_idx=start_block_idx, end_idx=num_blocks From 758ea2e980a1eeacec6097bfd98bd0a7c8fb864a Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Sat, 1 Nov 2025 23:45:02 -0400 Subject: [PATCH 078/976] [CI/Build] Fix flaky test_transcription_validation.py::test_basic_audio_gemma (#27924) Signed-off-by: Ben Browning --- tests/entrypoints/openai/test_transcription_validation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 6ef932392d095..f6133d4387b26 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -72,7 +72,9 @@ async def test_basic_audio_gemma(foscolo): model_name = "google/gemma-3n-E2B-it" server_args = ["--enforce-eager"] - with RemoteOpenAIServer(model_name, server_args) as remote_server: + with RemoteOpenAIServer( + model_name, server_args, max_wait_seconds=480 + ) as remote_server: client = remote_server.get_async_client() transcription = await client.audio.transcriptions.create( model=model_name, From 853a8eb53b89f9f3468ab553e86a964cb4e6cd1e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 2 Nov 2025 13:06:05 +0800 Subject: [PATCH 079/976] [Bugfix] Fix Qwen Omni audio inference (#27920) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/qwen2_5_omni_thinker.py | 9 ++------- vllm/model_executor/models/qwen3_omni_moe_thinker.py | 3 --- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 677d34dea39b3..7e970ebbe2bbc 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -130,6 +130,8 @@ class Qwen2_5OmniAudioFeatureInputs(TensorSchema): TensorShape("nmb", "tsl", dynamic_dims={"tsl"}), ] + audio_feature_lengths: Annotated[torch.Tensor, TensorShape("na")] + feature_attention_mask: Annotated[ torch.Tensor | list[torch.Tensor], TensorShape("na", "msl", dynamic_dims={"msl"}), @@ -732,13 +734,6 @@ class Qwen2_5OmniConditionalGenerationMixin: input_features = audio_input["input_features"] audio_feature_lengths = audio_input["audio_feature_lengths"] - if audio_feature_lengths.shape[0] == 1: - audio_feature_lengths = audio_feature_lengths.squeeze(0) - elif audio_feature_lengths.shape[1] == 1: - audio_feature_lengths = audio_feature_lengths.squeeze(1) - else: - raise AssertionError(audio_feature_lengths.shape) - audio_feat_lengths, audio_output_lengths = ( self.audio_tower._get_feat_extract_output_lengths(audio_feature_lengths) ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index efcd003fbbda7..f20e679027214 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -99,7 +99,6 @@ from .utils import ( AutoWeightsLoader, WeightsMapper, _merge_multimodal_embeddings, - flatten_bn, maybe_prefix, ) from .vision import ( @@ -1065,8 +1064,6 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix input_features = audio_input["input_features"] audio_feature_lengths = audio_input["audio_feature_lengths"] - audio_feature_lengths = flatten_bn(audio_feature_lengths, concat=True) - audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths( 
audio_feature_lengths ) From 73444b7b5623f5bc569277c8c7dc809843312d11 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Sun, 2 Nov 2025 09:48:33 +0100 Subject: [PATCH 080/976] Performance fix MistralTokenizer: cache special ids and tokens (#27925) Signed-off-by: Julien Denize Co-authored-by: Patrick von Platen --- vllm/transformers_utils/tokenizers/mistral.py | 66 +++++++++---------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 6f710bf23360f..7033523224c51 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -191,6 +191,12 @@ class MistralTokenizer(TokenizerBase): # Sort the dict for convenience self._vocab_dict = dict(sorted(self._vocab_dict.items(), key=lambda x: x[1])) + # Cache special tokens for faster access. + self._special_token_ids = self._get_special_token_ids() + self._special_token_ids_set = set(self._special_token_ids) + self._special_tokens = self._get_special_tokens(self._special_token_ids) + self._special_tokens_set = set(self._special_tokens) + # Vocab sorted by token id. self._vocab = self.tokenizer._vocab self._max_token_id = self.vocab_size - 1 @@ -210,23 +216,7 @@ class MistralTokenizer(TokenizerBase): ) ) - # the following attributes are set to fit vLLM's design and are used - # by the structured output backends. - @property - def all_special_tokens_extended(self) -> list[str]: - return self.all_special_tokens - - @property - def all_special_tokens(self) -> list[str]: - from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy - - return [ - self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP) - for i in self.all_special_ids - ] - - @property - def all_special_ids(self) -> list[int]: + def _get_special_token_ids(self) -> list[int]: from mistral_common.tokens.tokenizers.sentencepiece import ( SentencePieceTokenizer, ) @@ -244,6 +234,28 @@ class MistralTokenizer(TokenizerBase): raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}") return sorted(special_ids) + def _get_special_tokens(self, all_special_ids: list[int]) -> list[str]: + from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy + + return [ + self.tokenizer.decode([i], special_token_policy=SpecialTokenPolicy.KEEP) + for i in all_special_ids + ] + + # the following attributes are set to fit vLLM's design and are used + # by the structured output backends. 
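+    # They return the values cached in __init__, so repeated accesses no longer
+    # re-decode every special token.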
+ @property + def all_special_tokens_extended(self) -> list[str]: + return self.all_special_tokens + + @property + def all_special_tokens(self) -> list[str]: + return self._special_tokens + + @property + def all_special_ids(self) -> list[int]: + return self._special_token_ids + @property def bos_token_id(self) -> int: return self.tokenizer.bos_id @@ -277,21 +289,7 @@ class MistralTokenizer(TokenizerBase): raise NotImplementedError() def _is_special_token_id(self, token_id: int) -> bool: - from mistral_common.tokens.tokenizers.sentencepiece import ( - SentencePieceTokenizer, - ) - from mistral_common.tokens.tokenizers.tekken import Tekkenizer - - if self.is_spm: - assert isinstance(self.tokenizer, SentencePieceTokenizer), type( - self.tokenizer - ) - return token_id in self.tokenizer._control_tokens - if self.is_tekken: - assert isinstance(self.tokenizer, Tekkenizer), type(self.tokenizer) - return token_id < self.tokenizer.num_special_tokens - else: - raise ValueError(f"Unknown tokenizer type: {type(self.tokenizer)}") + return token_id in self._special_token_ids_set def __len__(self) -> int: return self.vocab_size @@ -405,7 +403,7 @@ class MistralTokenizer(TokenizerBase): tokens = [ t for t in tokens - if (t in to_decode_special_tokens or t not in self.all_special_tokens) + if (t in to_decode_special_tokens or t not in self._special_tokens_set) ] if any(isinstance(t, bytes) for t in tokens): @@ -489,7 +487,7 @@ class MistralTokenizer(TokenizerBase): # We filtered unwanted special tokens so we can decode the rest. tokens = [ self.tokenizer.id_to_byte_piece(token_id, SpecialTokenPolicy.KEEP) - if token_id not in self.all_special_ids + if token_id not in self._special_token_ids_set else self.tokenizer.decode([token_id], SpecialTokenPolicy.KEEP) for token_id in ids_kept ] From 00b31a36a2d0de6d197a473280b2304d482714af Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Sun, 2 Nov 2025 14:16:23 +0200 Subject: [PATCH 081/976] [V1] [Hybrid] Mamba1 Automatic Prefix Caching (#26377) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- csrc/mamba/mamba_ssm/selective_scan.h | 8 +- csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 134 +++++++++++++++--- csrc/ops.h | 24 ++-- csrc/torch_bindings.cpp | 6 +- tests/kernels/mamba/test_mamba_ssm.py | 15 ++ .../models/language/generation/test_hybrid.py | 34 ++--- vllm/_custom_ops.py | 8 ++ vllm/config/model.py | 6 + .../layers/mamba/mamba_mixer.py | 91 ++++++++---- .../layers/mamba/ops/mamba_ssm.py | 24 +++- vllm/model_executor/models/config.py | 2 +- vllm/model_executor/models/jamba.py | 21 ++- vllm/model_executor/models/mamba.py | 9 +- vllm/v1/attention/backends/mamba1_attn.py | 111 ++++++++++++--- vllm/v1/attention/backends/mamba2_attn.py | 40 +----- vllm/v1/attention/backends/mamba_attn.py | 62 +++++++- 16 files changed, 442 insertions(+), 153 deletions(-) diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h index 13c6178941cf8..7d22dd8b84a39 100644 --- a/csrc/mamba/mamba_ssm/selective_scan.h +++ b/csrc/mamba/mamba_ssm/selective_scan.h @@ -24,6 +24,8 @@ struct SSMParamsBase { int64_t pad_slot_id; bool delta_softplus; + bool cache_enabled; + int block_size; index_t A_d_stride; index_t A_dstate_stride; @@ -46,8 +48,9 @@ struct SSMParamsBase { index_t out_z_batch_stride; index_t out_z_d_stride; index_t ssm_states_batch_stride; - index_t ssm_states_dim_stride; + index_t ssm_states_dim_stride; index_t ssm_states_dstate_stride; + index_t cache_indices_stride; 
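+    // Row stride of the cache_indices tensor: with prefix caching it is laid
+    // out as (batch, max_positions) and the kernel reads request b's slot list
+    // via cache_indices_ptr + b * cache_indices_stride (set to 0 when no
+    // cache_indices tensor is provided).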
// Common data pointers. void *__restrict__ A_ptr; @@ -66,6 +69,9 @@ struct SSMParamsBase { void *__restrict__ cache_indices_ptr; void *__restrict__ has_initial_state_ptr; + void *__restrict__ block_idx_first_scheduled_token_ptr; // (batch,) - first block to write + void *__restrict__ block_idx_last_scheduled_token_ptr; // (batch,) - last block to write + void *__restrict__ initial_state_idx_ptr; // (batch,) - index of the initial state to use }; diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index d534e138d26d6..fb2a2e5789999 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -119,7 +119,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr : reinterpret_cast(params.cache_indices_ptr); - const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; + const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; // cache_index == params.pad_slot_id is defined as padding, so we exit early if (cache_index == params.pad_slot_id){ return; @@ -133,9 +133,18 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; - typename Ktraits::state_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + - cache_index * params.ssm_states_batch_stride + - dim_id * kNRows * params.ssm_states_dim_stride; + + typename Ktraits::state_t *ssm_states; + if (params.cache_enabled) { + // APC mode: ssm_states points to the base, we'll use absolute cache slots later + ssm_states = reinterpret_cast(params.ssm_states_ptr) + + dim_id * kNRows * params.ssm_states_dim_stride; + } else { + // Non-APC mode: offset by cache_index as before + ssm_states = reinterpret_cast(params.ssm_states_ptr) + + cache_index * params.ssm_states_batch_stride + + dim_id * kNRows * params.ssm_states_dim_stride; + } float D_val[kNRows] = {0}; if (params.D_ptr != nullptr) { @@ -159,7 +168,22 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { // } constexpr int kChunkSize = kNThreads * kNItems; - const int n_chunks = (seqlen + 2048 - 1) / 2048; + + // Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility + const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048; + const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size; + + const int* batch_cache_indices = cache_indices != nullptr ? + cache_indices + batch_id * params.cache_indices_stride : nullptr; + const int* block_idx_first_scheduled = params.block_idx_first_scheduled_token_ptr != nullptr ? + reinterpret_cast(params.block_idx_first_scheduled_token_ptr) : nullptr; + const int* block_idx_last_scheduled = params.block_idx_last_scheduled_token_ptr != nullptr ? + reinterpret_cast(params.block_idx_last_scheduled_token_ptr) : nullptr; + const int* initial_state_idx = params.initial_state_idx_ptr != nullptr ? + reinterpret_cast(params.initial_state_idx_ptr) : nullptr; + + const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? 
batch_cache_indices[initial_state_idx[batch_id]] : cache_index; + for (int chunk = 0; chunk < n_chunks; ++chunk) { input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems]; @@ -219,7 +243,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { if constexpr (kIsVariableC) { auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1; load_weight(Cvar + state_idx * params.C_dstate_stride, C_vals, - smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1 )); + smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1)); if constexpr (!kIsVariableB) { #pragma unroll for (int r = 0; r < kNRows; ++r) { @@ -242,7 +266,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { for (int i = 0; i < kNItems; ++i) { thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]), !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]); - if (seqlen % (kNItems * kNThreads) != 0) { // So that the last state is correct if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) { thread_data[i] = make_float2(1.f, 0.f); @@ -250,8 +273,24 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } } // Initialize running total - - scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0); + scan_t running_prefix; + if (chunk > 0) { + running_prefix = smem_running_prefix[state_idx + r * MAX_DSTATE]; + } else { + // Load initial state + if (params.cache_enabled && has_initial_state && batch_cache_indices != nullptr) { + size_t state_offset = load_cache_slot * params.ssm_states_batch_stride + + r * params.ssm_states_dim_stride + + state_idx * params.ssm_states_dstate_stride; + running_prefix = make_float2(1.0, float(ssm_states[state_offset])); + } else if (has_initial_state) { + // Non-APC mode: load from current batch position + running_prefix = make_float2(1.0, float(ssm_states[state_idx * params.ssm_states_dstate_stride])); + } else { + // No initial state + running_prefix = make_float2(1.0, 0.0); + } + } SSMScanPrefixCallbackOp prefix_op(running_prefix); typename Ktraits::BlockScanT(smem_scan).InclusiveScan( @@ -260,8 +299,25 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { // There's a syncthreads in the scan op, so we don't need to sync here. // Unless there's only 1 warp, but then it's the same thread (0) reading and writing. 
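+            // With prefix caching enabled, thread 0 persists the running SSM state
+            // once per chunk: intermediate chunks go to the slot indexed by
+            // block_idx_first_scheduled_token[batch] + chunk, the final chunk to
+            // block_idx_last_scheduled_token[batch]; without caching only the final
+            // state is written to the request's own slot, as before.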
if (threadIdx.x == 0) { - smem_running_prefix[state_idx] = prefix_op.running_prefix; - if (chunk == n_chunks - 1) { + smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix; + + // Store state at the end of each chunk when cache is enabled + if (params.cache_enabled && batch_cache_indices != nullptr) { + + size_t cache_slot; + if (chunk == n_chunks - 1) { + cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]]; + } else { + cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk]; + } + + size_t state_offset = cache_slot * params.ssm_states_batch_stride + + r * params.ssm_states_dim_stride + + state_idx * params.ssm_states_dstate_stride; + + ssm_states[state_offset] = typename Ktraits::state_t(prefix_op.running_prefix.y); + } else if (!params.cache_enabled && chunk == n_chunks - 1) { + // Non-APC mode: store only final state at current batch position ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y); } } @@ -274,7 +330,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } } } - input_t *out = reinterpret_cast(params.out_ptr) + sequence_start_index * params.out_batch_stride + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize; __syncthreads(); @@ -346,7 +401,9 @@ template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { #ifndef USE_ROCM - if (params.seqlen <= 128) { + if (params.cache_enabled && params.block_size == 1024) { + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); + } else if (params.seqlen <= 128) { selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 256) { selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream); @@ -358,7 +415,9 @@ void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #else - if (params.seqlen <= 256) { + if (params.cache_enabled && params.block_size == 1024) { + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); + } else if (params.seqlen <= 256) { selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream); @@ -437,13 +496,17 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const std::optional& D, const std::optional& delta_bias, const torch::Tensor ssm_states, - bool has_z, + bool has_z, bool delta_softplus, const std::optional& query_start_loc, const std::optional& cache_indices, const std::optional& has_initial_state, bool varlen, - int64_t pad_slot_id) { + int64_t pad_slot_id, + int64_t block_size, + const std::optional &block_idx_first_scheduled_token, + const std::optional &block_idx_last_scheduled_token, + const std::optional &initial_state_idx) { // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -477,6 +540,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr; params.has_initial_state_ptr = has_initial_state.has_value() ? 
has_initial_state.value().data_ptr() : nullptr; + // Set cache parameters - cache is enabled if we have direct cache writing params + params.cache_enabled = block_idx_first_scheduled_token.has_value(); + params.block_size = static_cast(block_size); + + // Set direct cache writing pointers + params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr; + params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr; + params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr; // All stride are in elements, not bytes. params.A_d_stride = A.stride(0); @@ -504,9 +575,11 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.out_d_stride = out.stride(0); params.ssm_states_batch_stride = ssm_states.stride(0); - params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dim_stride = ssm_states.stride(1); params.ssm_states_dstate_stride = ssm_states.stride(2); + params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0; + } else{ if (!is_variable_B) { @@ -537,8 +610,10 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.out_d_stride = out.stride(1); params.ssm_states_batch_stride = ssm_states.stride(0); - params.ssm_states_dim_stride = ssm_states.stride(1); + params.ssm_states_dim_stride = ssm_states.stride(1); params.ssm_states_dstate_stride = ssm_states.stride(2); + + params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0; } } @@ -554,7 +629,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const torch::Tensor &ssm_states, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early - int64_t pad_slot_id) { + int64_t pad_slot_id, + int64_t block_size, + const std::optional &block_idx_first_scheduled_token, + const std::optional &block_idx_last_scheduled_token, + const std::optional &initial_state_idx) { auto input_type = u.scalar_type(); auto weight_type = A.scalar_type(); TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); @@ -646,7 +725,16 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, auto cache_indices_ = cache_indices.value(); TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int); TORCH_CHECK(cache_indices_.is_cuda()); - CHECK_SHAPE(cache_indices_, batch_size); + + // cache_indices can be either 1D (batch_size,) for non-APC mode + // or 2D (batch_size, max_positions) for APC mode + const bool is_apc_mode = block_idx_first_scheduled_token.has_value(); + if (is_apc_mode) { + TORCH_CHECK(cache_indices_.dim() == 2, "cache_indices must be 2D for APC mode"); + TORCH_CHECK(cache_indices_.size(0) == batch_size, "cache_indices first dimension must match batch_size"); + } else { + CHECK_SHAPE(cache_indices_, batch_size); + } } @@ -686,7 +774,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, cache_indices, has_initial_state, varlen, - pad_slot_id + pad_slot_id, + block_size, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + initial_state_idx ); diff --git a/csrc/ops.h b/csrc/ops.h index 0bed7492f6616..3f5cb799b774c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -321,17 +321,19 @@ void dynamic_per_token_scaled_fp8_quant( torch::Tensor& 
out, torch::Tensor const& input, torch::Tensor& scale, std::optional const& scale_ub); -void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, - const torch::Tensor& A, const torch::Tensor& B, - const torch::Tensor& C, - const std::optional& D_, - const std::optional& z_, - const std::optional& delta_bias_, - bool delta_softplus, - const std::optional& query_start_loc, - const std::optional& cache_indices, - const std::optional& has_initial_state, - const torch::Tensor& ssm_states, int64_t pad_slot_id); +void selective_scan_fwd( + const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, + const torch::Tensor& B, const torch::Tensor& C, + const std::optional& D_, + const std::optional& z_, + const std::optional& delta_bias_, bool delta_softplus, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, + const torch::Tensor& ssm_states, int64_t pad_slot_id, int64_t block_size, + const std::optional& block_idx_first_scheduled_token, + const std::optional& block_idx_last_scheduled_token, + const std::optional& initial_state_idx); torch::Tensor dynamic_4bit_int_moe_cpu( torch::Tensor x, torch::Tensor topk_ids, torch::Tensor topk_weights, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 8f091a429fbef..9c0f524dcab11 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -611,7 +611,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor? cache_indices," "Tensor? has_initial_state," "Tensor! ssm_states," - "int pad_slot_id) -> ()"); + "int pad_slot_id," + "int block_size," + "Tensor? block_idx_first_scheduled_token," + "Tensor? block_idx_last_scheduled_token," + "Tensor? initial_state_idx) -> ()"); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); // Hadamard transforms diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index c59fc7af0c897..98edc959957d0 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -179,6 +179,10 @@ def selective_scan_opcheck_fn( has_initial_state=None, ssm_states=None, pad_slot_id=PAD_SLOT_ID, + block_size=2048, + block_idx_first_scheduled_token=None, + block_idx_last_scheduled_token=None, + initial_state_idx=None, ): """if return_last_state is True, returns (out, last_state) last_state has shape (batch, dim, dstate). 
@@ -223,6 +227,10 @@ def selective_scan_opcheck_fn( has_initial_state, ssm_states, pad_slot_id, + block_size, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + initial_state_idx, ), test_utils=["test_schema", "test_faketensor"], ) @@ -338,6 +346,11 @@ def test_selective_scan( has_initial_state=torch.ones(batch_size, device=u.device, dtype=torch.bool) if c > 0 else None, + pad_slot_id=PAD_SLOT_ID, + block_size=2048, + block_idx_first_scheduled_token=None, + block_idx_last_scheduled_token=None, + initial_state_idx=None, ) outs.append(out) if len(outs) > 1: @@ -372,6 +385,7 @@ def test_selective_scan( delta_bias=delta_bias, delta_softplus=delta_softplus, ssm_states=state, + block_size=2048, ) @@ -586,6 +600,7 @@ def test_selective_scan_varlen( padded_state_indices, has_initial_state, prev_state, + block_size=2048, ) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index fd2df329f17f9..681b380e6a155 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -19,6 +19,8 @@ pytestmark = pytest.mark.hybrid_model # meaning that it will be used in all tests in this file # The rest of the models will only be tested by test_models +APC_MULTIPLY_BY = 300 + SSM_MODELS = [ "state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev", @@ -380,7 +382,7 @@ def _get_vLLM_output( return outs, vllm_model -@pytest.mark.parametrize("model", [HYBRID_MODELS[3]]) +@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("n_repetitions", [2]) # If num_logprobs is set to -1, then the stringent version @@ -410,10 +412,8 @@ def test_apc_single_prompt( check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore ) - MULTIPLE = 300 - # Sample prompts. - generated_prompts = [MULTIPLE * example_prompts[0]] + generated_prompts = [APC_MULTIPLY_BY * example_prompts[0]] max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts) vllm_runner_kwargs = _get_vllm_runner_params( @@ -446,7 +446,7 @@ def test_apc_single_prompt( ) -@pytest.mark.parametrize("model", [HYBRID_MODELS[3]]) +@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("n_repetitions", [2]) # If num_logprobs is set to -1, then the stringent version @@ -476,10 +476,8 @@ def test_apc_single_prompt_block_align_alignment( check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore ) - MULTIPLE = 300 - # Sample prompts. This custom prompt is used, as it causes the most issues - generated_prompts = ["The president of the United States is " * MULTIPLE] + generated_prompts = ["The president of the United States is " * APC_MULTIPLY_BY] max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts) vllm_runner_kwargs = _get_vllm_runner_params( @@ -528,7 +526,7 @@ def test_apc_single_prompt_block_align_alignment( ) -@pytest.mark.parametrize("model", [HYBRID_MODELS[3]]) +@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("n_repetitions", [2]) # If num_logprobs is set to -1, then the stringent version @@ -558,10 +556,8 @@ def test_apc_multiple_prompts_all_cached_outputs( check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore ) - MULTIPLE = 300 - # Sample prompts. 
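+    # APC_MULTIPLY_BY (module-level, 300) replaces the per-test MULTIPLE
+    # constant used to repeat the prompts.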
- generated_prompts = [MULTIPLE * prompt for prompt in example_prompts] + generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts] max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts) vllm_runner_kwargs = _get_vllm_runner_params( @@ -595,7 +591,7 @@ def test_apc_multiple_prompts_all_cached_outputs( ) -@pytest.mark.parametrize("model", [HYBRID_MODELS[3]]) +@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("n_repetitions", [2]) # If num_logprobs is set to -1, then the stringent version @@ -625,12 +621,12 @@ def test_apc_multiple_prompts_block_align_alignment( check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore ) - MULTIPLE = 300 - # Sample prompts. This custom prompt is used, as it causes the most issues prompt_text = "The president of the United States is " prompt_offsets = [0, 3, 7, 13, 17, 22, 25, 31] - generated_prompts = [prompt_text[offset:] * MULTIPLE for offset in prompt_offsets] + generated_prompts = [ + prompt_text[offset:] * APC_MULTIPLY_BY for offset in prompt_offsets + ] max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts) vllm_runner_kwargs = _get_vllm_runner_params( @@ -679,7 +675,7 @@ def test_apc_multiple_prompts_block_align_alignment( ) -@pytest.mark.parametrize("model", [HYBRID_MODELS[3]]) +@pytest.mark.parametrize("model", [HYBRID_MODELS[0], HYBRID_MODELS[3]]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("n_repetitions", [2]) # If num_logprobs is set to -1, then the stringent version @@ -709,10 +705,8 @@ def test_apc_multiple_prompts_partial_cached_outputs( check_logprobs_close if num_logprobs > 0 else check_outputs_equal # type: ignore ) - MULTIPLE = 300 - # Sample prompts. - generated_prompts = [MULTIPLE * prompt for prompt in example_prompts] + generated_prompts = [APC_MULTIPLY_BY * prompt for prompt in example_prompts] max_model_len = max(len(prompt) + max_tokens for prompt in generated_prompts) vllm_runner_kwargs = _get_vllm_runner_params( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 9110b0573fc92..61cf54fcfa39a 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1719,6 +1719,10 @@ def selective_scan_fwd( has_initial_state: torch.Tensor | None, ssm_states: torch.Tensor, pad_slot_id: int, + block_size: int = 1024, + block_idx_first_scheduled_token: torch.Tensor | None = None, + block_idx_last_scheduled_token: torch.Tensor | None = None, + initial_state_idx: torch.Tensor | None = None, ): torch.ops._C.selective_scan_fwd( u, @@ -1735,6 +1739,10 @@ def selective_scan_fwd( has_initial_state, ssm_states, pad_slot_id, + block_size, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + initial_state_idx, ) diff --git a/vllm/config/model.py b/vllm/config/model.py index 082f90653f5af..2e80df4311035 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1483,6 +1483,12 @@ class ModelConfig: if chunk_size is None: # used by e.g. Mamba2, NemotronH, Zamba chunk_size = getattr(self.hf_text_config, "chunk_size", None) + + # Since Mamba1 does not have a chunk notion + # we use a default chunk size of 1024. 
+ if chunk_size is None: + chunk_size = 2048 + return chunk_size def get_multimodal_config(self) -> MultiModalConfig: diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index a9a0c216474bc..b6345b8af7f0a 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -241,18 +241,21 @@ class MambaMixer(MambaBase, CustomOp): forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.attn_metadata + assert self.cache_config is not None + mamba_block_size = self.cache_config.mamba_block_size + prefix_caching_enabled = self.cache_config.enable_prefix_caching + if attn_metadata is not None: assert isinstance(attn_metadata, dict) attn_metadata = attn_metadata[self.prefix] - mamba1_metadata = attn_metadata - assert isinstance(mamba1_metadata, Mamba1AttentionMetadata) - query_start_loc = mamba1_metadata.query_start_loc - state_indices_tensor = mamba1_metadata.state_indices_tensor + assert isinstance(attn_metadata, Mamba1AttentionMetadata) + query_start_loc_p = attn_metadata.query_start_loc_p + state_indices_tensor = attn_metadata.state_indices_tensor self_kv_cache = self.kv_cache[forward_context.virtual_engine] conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] - has_initial_states = mamba1_metadata.has_initial_states - num_padded_decodes = mamba1_metadata.num_padded_decodes + has_initial_states_p = attn_metadata.has_initial_states_p + num_padded_decodes = attn_metadata.num_padded_decodes # 1. Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -279,12 +282,8 @@ class MambaMixer(MambaBase, CustomOp): hidden_states_BC, gate, state_indices_tensor, - query_start_loc, - has_initial_states, num_prefill_tokens, - num_decode_tokens, num_prefills, - num_decodes, num_padded_decodes, ) hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p @@ -293,8 +292,34 @@ class MambaMixer(MambaBase, CustomOp): gate_d = prefill_decode_split.gate_d state_indices_tensor_p = prefill_decode_split.state_indices_tensor_p state_indices_tensor_d = prefill_decode_split.state_indices_tensor_d - query_start_loc_p = prefill_decode_split.query_start_loc_p - has_initial_states_p = prefill_decode_split.has_initial_states_p + + if prefix_caching_enabled: + block_idx_last_computed_token_d, block_idx_last_computed_token_p = ( + torch.split( + attn_metadata.block_idx_last_computed_token, + [num_decodes, num_prefills], + dim=0, + ) + ) + block_idx_last_scheduled_token_d, block_idx_last_scheduled_token_p = ( + torch.split( + attn_metadata.block_idx_last_scheduled_token, + [num_decodes, num_prefills], + dim=0, + ) + ) + + block_idx_first_scheduled_token_p = ( + attn_metadata.block_idx_first_scheduled_token_p + ) + num_computed_tokens_p = attn_metadata.num_computed_tokens_p + else: + block_idx_last_computed_token_d = None + block_idx_last_computed_token_p = None + block_idx_last_scheduled_token_d = None + block_idx_last_scheduled_token_p = None + block_idx_first_scheduled_token_p = None + num_computed_tokens_p = None ssm_outputs = [] @@ -309,6 +334,11 @@ class MambaMixer(MambaBase, CustomOp): has_initial_state=has_initial_states_p, cache_indices=state_indices_tensor_p, query_start_loc=query_start_loc_p, + block_idx_first_scheduled_token=block_idx_first_scheduled_token_p, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_p, + initial_state_idx=block_idx_last_computed_token_p, + 
num_computed_tokens=num_computed_tokens_p, + block_size_to_align=mamba_block_size, ) # 3. State Space Model sequence transformations. discrete_time_step_p, B_p, C_p = self._ssm_transform( @@ -331,10 +361,24 @@ class MambaMixer(MambaBase, CustomOp): cache_indices=state_indices_tensor_p, has_initial_state=has_initial_states_p, query_start_loc=query_start_loc_p, + block_size=mamba_block_size, + block_idx_first_scheduled_token=block_idx_first_scheduled_token_p, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_p, + initial_state_idx=block_idx_last_computed_token_p, ) ssm_outputs.append(scan_out_p) if has_decode: + if prefix_caching_enabled: + state_indices_tensor_d_input = state_indices_tensor_d.gather( + 1, block_idx_last_computed_token_d.unsqueeze(1) + ).squeeze(1) + state_indices_tensor_d_output = state_indices_tensor_d.gather( + 1, block_idx_last_scheduled_token_d.unsqueeze(1) + ).squeeze(1) + else: + state_indices_tensor_d_input = state_indices_tensor_d + state_indices_tensor_d_output = state_indices_tensor_d # 2. Convolution sequence transformation conv_out_d = causal_conv1d_update( hidden_states_BC_d.transpose(0, 1), @@ -343,6 +387,8 @@ class MambaMixer(MambaBase, CustomOp): self.conv1d.bias, self.activation, conv_state_indices=state_indices_tensor_d, + block_idx_last_scheduled_token=block_idx_last_scheduled_token_d, + initial_state_idx=block_idx_last_computed_token_d, ).transpose(0, 1) # 3. State Space Model sequence transformation. @@ -364,7 +410,8 @@ class MambaMixer(MambaBase, CustomOp): gate_d.transpose(0, 1), time_proj_bias, dt_softplus=True, - state_batch_indices=state_indices_tensor_d, + state_batch_indices=state_indices_tensor_d_input, + dst_state_batch_indices=state_indices_tensor_d_output, out=scan_outputs_d, ) scan_outputs_d = scan_outputs_d.transpose(0, 1) @@ -423,20 +470,14 @@ class PrefillDecodeSplit(NamedTuple): gate_d: torch.Tensor state_indices_tensor_p: torch.Tensor state_indices_tensor_d: torch.Tensor - query_start_loc_p: torch.Tensor | None - has_initial_states_p: torch.Tensor | None def split_batch_to_prefill_and_decode( hidden_states_BC: torch.Tensor, gate: torch.Tensor, state_indices_tensor: torch.Tensor, - query_start_loc: torch.Tensor, - has_initial_states: torch.Tensor | None, num_prefill_tokens: int, - num_decode_tokens: int, num_prefills: int, - num_decodes: int, num_padded_decodes: int, ) -> PrefillDecodeSplit: num_actual_tokens = num_prefill_tokens + num_padded_decodes @@ -457,16 +498,6 @@ def split_batch_to_prefill_and_decode( [num_padded_decodes, num_prefills], dim=0, ) - query_start_loc_p = ( - query_start_loc[-num_prefills - 1 :] - num_padded_decodes - if num_prefills > 0 - else None - ) - has_initial_states_p = ( - has_initial_states[-num_prefills:] - if (has_initial_states is not None and num_prefills > 0) - else None - ) return PrefillDecodeSplit( hidden_states_BC_p=hidden_states_BC_p, @@ -475,8 +506,6 @@ def split_batch_to_prefill_and_decode( gate_d=gate_d, state_indices_tensor_p=state_indices_tensor_p, state_indices_tensor_d=state_indices_tensor_d, - query_start_loc_p=query_start_loc_p, - has_initial_states_p=has_initial_states_p, ) diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 8722eb9a7b22f..53fd5d5458b09 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -375,6 +375,10 @@ def selective_scan_fn( cache_indices=None, has_initial_state=None, pad_slot_id=PAD_SLOT_ID, + block_size=1024, + 
block_idx_first_scheduled_token=None, + block_idx_last_scheduled_token=None, + initial_state_idx=None, ) -> torch.Tensor: """ u: (dim, total_length) for varlen or (batch, dim, seqlen) @@ -397,7 +401,10 @@ def selective_scan_fn( x.shape=(dim,17) cache_indices: (batch) int32 A tensor with each cell is a correspondent - input and output ssm_state index + input and output ssm_state indices + - Without APC: (batch,) - single state index per batch item + - With APC: (batch, max_positions) - cache block indices for read/write + Each non-zero value indicates a cache block to load from and/or write to. has_initial_state: (batch) bool A tensor populated with ones and zeros, indicate if the ssm_state at the corresponding index should be @@ -408,6 +415,17 @@ def selective_scan_fn( that will not be processed, for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id] in this case, the kernel will not process entries at indices 0 and 3 + block_size: int + The block size to align the cached states to + block_idx_first_scheduled_token: (batch,), dtype int32 + The pointer into cache_indices, where the first + cache block to be filled is located. + block_idx_last_scheduled_token: (batch,), dtype int32 + The pointer into cache_indices, where the last cache block + to be filled is located. + initial_state_idx: (batch,), dtype int32 + The pointer into cache_indices, where the cache block + containing the initial state is located. returns output: (dim, total_length) for varlen or (batch, dim, seqlen) supports inplace replacement @@ -448,6 +466,10 @@ def selective_scan_fn( has_initial_state, ssm_states, pad_slot_id, + block_size, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + initial_state_idx, ) if z is None: diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 7150977e9266b..5dda2ec97875f 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -299,7 +299,7 @@ class MambaModelConfig(VerifyAndUpdateConfig): if model_config.supports_mamba_prefix_caching: logger.info( "Warning: Prefix caching is currently enabled. " - "Its support for Mamba2 layers is experimental. " + "Its support for Mamba layers is experimental. " "Please report any issues you may observe." 
) else: diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index f8a87cf6965f8..ba95021b0b542 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -38,7 +38,13 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaMLP as JambaMLP from vllm.sequence import IntermediateTensors -from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP +from .interfaces import ( + HasInnerState, + IsHybrid, + SupportsLoRA, + SupportsMambaPrefixCaching, + SupportsPP, +) from .utils import ( AutoWeightsLoader, WeightsMapper, @@ -454,7 +460,14 @@ class JambaModel(nn.Module): return loaded_params -class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid): +class JambaForCausalLM( + nn.Module, + HasInnerState, + SupportsLoRA, + SupportsPP, + IsHybrid, + SupportsMambaPrefixCaching, +): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={".self_attn.": ".", ".A_log": ".A"}, ) @@ -477,12 +490,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHyb def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config lora_config = vllm_config.lora_config scheduler_config = vllm_config.scheduler_config - assert not cache_config.enable_prefix_caching, ( - "Jamba currently does not support prefix caching" - ) super().__init__() self.config = config diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index fb145289fbfe9..f684203f6d35e 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -29,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import ( HasInnerState, IsAttentionFree, + SupportsMambaPrefixCaching, SupportsPP, ) from vllm.sequence import IntermediateTensors @@ -193,15 +194,13 @@ class MambaModel(nn.Module): return loaded_params -class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP): +class MambaForCausalLM( + nn.Module, HasInnerState, IsAttentionFree, SupportsPP, SupportsMambaPrefixCaching +): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config lora_config = vllm_config.lora_config self.scheduler_config = vllm_config.scheduler_config - assert not cache_config.enable_prefix_caching, ( - "Mamba does not support prefix caching" - ) super().__init__() self.config = config diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 30c63e0ded8e7..909af09be255a 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -7,11 +7,13 @@ import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.config import VllmConfig from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, split_decodes_and_prefills, ) +from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec class Mamba1AttentionBackend(AttentionBackend): @@ -22,32 +24,41 @@ class Mamba1AttentionBackend(AttentionBackend): @dataclass class Mamba1AttentionMetadata: - query_start_loc: torch.Tensor - 
context_lens_tensor: torch.Tensor + query_start_loc_p: torch.Tensor state_indices_tensor: torch.Tensor - has_initial_states: torch.Tensor | None + has_initial_states_p: torch.Tensor | None num_prefills: int num_prefill_tokens: int num_decodes: int num_decode_tokens: int num_padded_decodes: int + block_idx_last_scheduled_token: torch.Tensor # shape: [batch,] + block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,] + block_idx_last_computed_token: torch.Tensor # shape: [batch,] + num_computed_tokens_p: torch.Tensor # shape: [batch,] + class Mamba1AttentionMetadataBuilder( BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata] ): + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + super().__init__(kv_cache_spec, layer_names, vllm_config, device) + assert isinstance(kv_cache_spec, MambaSpec) + def build( self, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False, ) -> Mamba1AttentionMetadata: - query_start_loc = common_attn_metadata.query_start_loc - - state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] - context_lens_tensor = common_attn_metadata.num_computed_tokens_cpu.to( - query_start_loc.device - ) + num_reqs = common_attn_metadata.num_reqs num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( split_decodes_and_prefills( @@ -55,32 +66,100 @@ class Mamba1AttentionMetadataBuilder( ) ) - has_initial_states = None + has_initial_states_p = None + query_start_loc_p = None padded_decodes = num_decodes + num_computed_tokens, num_computed_tokens_p = None, None + block_idx_first_scheduled_token = None + block_idx_first_scheduled_token_p = None + + # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here. 
+ # We should consolidate this code + if self.vllm_config.cache_config.enable_prefix_caching: + # Return a tensor of shape (#requests, #max blocks) + state_indices_tensor = common_attn_metadata.block_table_tensor + mamba_block_size = self.kv_cache_spec.block_size + num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( + self.device + ) + ( + block_idx_last_computed_token, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + ) = self._compute_prefix_caching_block_indices( + common_attn_metadata, mamba_block_size + ) + else: + # Always return just a single block per each request: + state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] + block_idx_last_scheduled_token = None + block_idx_last_computed_token = None if num_prefills > 0: - has_initial_states = context_lens_tensor > 0 + query_start_loc_p = ( + common_attn_metadata.query_start_loc[-num_prefills - 1 :] + - num_decode_tokens + ) + has_initial_states_cpu = ( + common_attn_metadata.num_computed_tokens_cpu[ + num_reqs - num_prefills : num_reqs + ] + > 0 + ) + has_initial_states_p = has_initial_states_cpu.to( + common_attn_metadata.query_start_loc.device + ) + + if self.vllm_config.cache_config.enable_prefix_caching: + assert num_computed_tokens is not None + num_computed_tokens_p = num_computed_tokens[ + num_reqs - num_prefills : num_reqs + ] + assert block_idx_first_scheduled_token is not None + block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[ + num_reqs - num_prefills : num_reqs + ] + elif ( num_decodes > 0 and num_decodes <= self.decode_cudagraph_max_bs and self.compilation_config.full_cuda_graph ): - state_indices_for_decode = state_indices_tensor[:num_decodes] padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes) self.state_indices_tensor[:num_decodes].copy_( - state_indices_for_decode, non_blocking=True + state_indices_tensor, non_blocking=True ) state_indices_tensor = self.state_indices_tensor[:padded_decodes] state_indices_tensor[num_decodes:] = PAD_SLOT_ID + if self.vllm_config.cache_config.enable_prefix_caching: + self.block_idx_last_scheduled_token[:num_decodes].copy_( + block_idx_last_scheduled_token, non_blocking=True + ) + block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ + :padded_decodes + ] + block_idx_last_scheduled_token[num_decodes:] = 0 + + self.block_idx_last_computed_token[:num_decodes].copy_( + block_idx_last_computed_token, non_blocking=True + ) + block_idx_last_computed_token = self.block_idx_last_computed_token[ + :padded_decodes + ] + block_idx_last_computed_token[num_decodes:] = 0 + return Mamba1AttentionMetadata( - query_start_loc=query_start_loc, - context_lens_tensor=context_lens_tensor, - has_initial_states=has_initial_states, + query_start_loc_p=query_start_loc_p, + has_initial_states_p=has_initial_states_p, state_indices_tensor=state_indices_tensor, num_prefills=num_prefills, num_prefill_tokens=num_prefill_tokens, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, num_padded_decodes=padded_decodes, + block_idx_last_scheduled_token=block_idx_last_scheduled_token, + block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, + block_idx_last_computed_token=block_idx_last_computed_token, + num_computed_tokens_p=num_computed_tokens_p, ) diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index f9d2426eaf632..4bc1057333a50 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -147,27 +147,6 
@@ class Mamba2AttentionMetadataBuilder( assert self.chunk_size is not None, ( "chunk_size needs to be set in the model config for Mamba2 models" ) - if self.vllm_config.cache_config.enable_prefix_caching: - self.state_indices_tensor = torch.empty( - ( - self.decode_cudagraph_max_bs, - cdiv( - vllm_config.model_config.max_model_len, kv_cache_spec.block_size - ), - ), - dtype=torch.int32, - device=device, - ) - self.block_idx_last_scheduled_token = torch.empty( - (self.decode_cudagraph_max_bs,), - dtype=torch.int32, - device=device, - ) - self.block_idx_last_computed_token = torch.empty( - (self.decode_cudagraph_max_bs,), - dtype=torch.int32, - device=device, - ) def build( self, @@ -202,20 +181,13 @@ class Mamba2AttentionMetadataBuilder( num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( self.device ) - # Block index of the last computed token - block_idx_last_computed_token = ( - cdiv(num_computed_tokens, mamba_block_size) - 1 + ( + block_idx_last_computed_token, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + ) = self._compute_prefix_caching_block_indices( + common_attn_metadata, mamba_block_size ) - # which is <= block index for the first scheduled token - block_idx_first_scheduled_token = ( - cdiv(num_computed_tokens + 1, mamba_block_size) - 1 - ) - # which is <= block index of the last scheduled token - block_idx_last_scheduled_token = ( - cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1 - ) - # -1 in case it's non-computed and causes later issues with indexing - block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0) else: # Always return just a single block per each request: state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 52f26a9e61cab..49d7d6c31b9a0 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -7,6 +7,7 @@ from typing import ClassVar, TypeVar import torch from vllm.config import VllmConfig +from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -38,11 +39,35 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): self.vllm_config.scheduler_config.max_num_seqs, self.compilation_config.max_cudagraph_capture_size, ) - self.state_indices_tensor = torch.empty( - (self.decode_cudagraph_max_bs,), - dtype=torch.int32, - device=device, - ) + + if self.vllm_config.cache_config.enable_prefix_caching: + self.state_indices_tensor = torch.empty( + ( + self.decode_cudagraph_max_bs, + cdiv( + self.vllm_config.model_config.max_model_len, + self.kv_cache_spec.block_size, + ), + ), + dtype=torch.int32, + device=device, + ) + self.block_idx_last_scheduled_token = torch.empty( + (self.decode_cudagraph_max_bs,), + dtype=torch.int32, + device=device, + ) + self.block_idx_last_computed_token = torch.empty( + (self.decode_cudagraph_max_bs,), + dtype=torch.int32, + device=device, + ) + else: + self.state_indices_tensor = torch.empty( + (self.decode_cudagraph_max_bs,), + dtype=torch.int32, + device=device, + ) def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata @@ -61,3 +86,30 @@ class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC): m.max_query_len = 1 # decode-only return self.build(0, m) + + def _compute_prefix_caching_block_indices( + self, + common_attn_metadata: CommonAttentionMetadata, + 
mamba_block_size: int, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to( + self.device + ) + # Block index of the last computed token + block_idx_last_computed_token = cdiv(num_computed_tokens, mamba_block_size) - 1 + # which is <= block index for the first scheduled token + block_idx_first_scheduled_token = ( + cdiv(num_computed_tokens + 1, mamba_block_size) - 1 + ) + # which is <= block index of the last scheduled token + block_idx_last_scheduled_token = ( + cdiv(common_attn_metadata.seq_lens, mamba_block_size) - 1 + ) + # -1 in case it's non-computed and causes later issues with indexing + block_idx_last_computed_token = block_idx_last_computed_token.clamp(min=0) + + return ( + block_idx_last_computed_token, + block_idx_first_scheduled_token, + block_idx_last_scheduled_token, + ) From 6c317a656eb09a641d85be05aa8498ff160bf0c1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 2 Nov 2025 21:42:38 +0800 Subject: [PATCH 082/976] [Misc] Provide Siglip2 chat template (#27939) Signed-off-by: DarkLight1337 --- vllm/transformers_utils/chat_templates/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index 3bdbe1d0a67b6..fe84b6c152eef 100644 --- a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -40,6 +40,7 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", "qwen": _get_qwen_chat_template_fallback, "siglip": CHAT_TEMPLATES_DIR / "template_basic.jinja", + "siglip2": CHAT_TEMPLATES_DIR / "template_basic.jinja", } From 0ce743f4e1879ffa250e471f6894633ef125418e Mon Sep 17 00:00:00 2001 From: Vensen Date: Mon, 3 Nov 2025 00:24:01 +0800 Subject: [PATCH 083/976] Fix(llm): Abort orphaned requests when llm.chat() batch fails Fixes #26081 (#27420) Signed-off-by: vensenmu --- tests/entrypoints/llm/test_chat.py | 53 ++++++++++++++++++++++++++++++ vllm/entrypoints/llm.py | 36 ++++++++++++-------- 2 files changed, 75 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index b2a958a992a62..a9698632b82e0 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -6,6 +6,7 @@ import pytest from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory +from vllm.sampling_params import SamplingParams from ..openai.test_vision import TEST_IMAGE_ASSETS @@ -23,6 +24,29 @@ def text_llm(): cleanup_dist_env_and_memory() +@pytest.fixture(scope="function") +def llm_for_failure_test(): + """ + Fixture for testing issue #26081. + Uses a small max_model_len to easily trigger length errors. + """ + # pytest caches the fixture so we use weakref.proxy to + # enable garbage collection + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + seed=0, + max_model_len=128, + disable_log_stats=True, + ) + + yield weakref.proxy(llm) + + del llm + + cleanup_dist_env_and_memory() + + def test_chat(text_llm): prompt1 = "Explain the concept of entropy." 
messages = [ @@ -157,3 +181,32 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking): else: # The chat template includes dummy thinking process assert think_id in prompt_token_ids + + +def test_chat_batch_failure_cleanup(llm_for_failure_test): + """ + Tests that if a batch call to llm.chat() fails mid-way + (e.g., due to one invalid prompt), the requests that + were already enqueued are properly aborted and do not + pollute the queue for subsequent calls. + (Fixes Issue #26081) + """ + llm = llm_for_failure_test + valid_msg = [{"role": "user", "content": "Hello"}] + long_text = "This is a very long text to test the error " * 50 + invalid_msg = [{"role": "user", "content": long_text}] + batch_1 = [ + valid_msg, + valid_msg, + invalid_msg, + ] + batch_2 = [ + valid_msg, + valid_msg, + ] + sampling_params = SamplingParams(temperature=0, max_tokens=10) + with pytest.raises(ValueError, match="longer than the maximum model length"): + llm.chat(batch_1, sampling_params=sampling_params) + outputs_2 = llm.chat(batch_2, sampling_params=sampling_params) + assert len(outputs_2) == len(batch_2) + assert llm.llm_engine.get_num_unfinished_requests() == 0 diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b0b996ab2fec5..22fe2ae9280aa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1588,20 +1588,27 @@ class LLM: tqdm_func = use_tqdm if callable(use_tqdm) else tqdm it = tqdm_func(it, desc="Adding requests") - for i, prompt in enumerate(it): - if isinstance(prompt, dict): - self._validate_mm_data_and_uuids( - prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids") - ) + added_request_ids: list[str] = [] - self._add_request( - prompt, - params[i] if isinstance(params, Sequence) else params, - lora_request=lora_request[i] - if isinstance(lora_request, Sequence) - else lora_request, - priority=priority[i] if priority else 0, - ) + try: + for i, prompt in enumerate(it): + if isinstance(prompt, dict): + self._validate_mm_data_and_uuids( + prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids") + ) + request_id = self._add_request( + prompt, + params[i] if isinstance(params, Sequence) else params, + lora_request=lora_request[i] + if isinstance(lora_request, Sequence) + else lora_request, + priority=priority[i] if priority else 0, + ) + added_request_ids.append(request_id) + except Exception as e: + if added_request_ids: + self.llm_engine.abort_request(added_request_ids) + raise e def _validate_mm_data_and_uuids( self, @@ -1684,7 +1691,7 @@ class LLM: params: SamplingParams | PoolingParams, lora_request: LoRARequest | None = None, priority: int = 0, - ) -> None: + ) -> str: prompt_text, _, _ = get_prompt_components(prompt) request_id = str(next(self.request_counter)) @@ -1705,6 +1712,7 @@ class LLM: priority=priority, prompt_text=prompt_text, ) + return request_id def _run_engine( self, *, use_tqdm: bool | Callable[..., tqdm] = True From 1bf43ae35d7f6a83cc2025b8c0a2332456f4afe9 Mon Sep 17 00:00:00 2001 From: Biswa Panda Date: Sun, 2 Nov 2025 18:08:08 -0800 Subject: [PATCH 084/976] [BugFix][LoRA] use adapter_id instead of id field of lora_request (#27728) Signed-off-by: Biswa Panda --- tests/v1/core/test_prefix_caching.py | 63 +++++++++++++++++++++++++++- vllm/v1/core/block_pool.py | 4 +- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 837a513cb75e1..2291f363731f2 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ 
-9,7 +9,8 @@ import pytest import torch import vllm.v1.core.kv_cache_utils as kv_cache_utils -from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved +from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved, BlockStored +from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import ( MultiModalFeatureSpec, MultiModalKwargsItem, @@ -59,6 +60,7 @@ def make_request( mm_hashes: list[str] | None = None, prompt_logprobs: int | None = None, cache_salt: str | None = None, + lora_request: LoRARequest | None = None, ): mm_features = [] if mm_positions is not None: @@ -79,7 +81,7 @@ def make_request( sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), pooling_params=None, eos_token_id=100, - lora_request=None, + lora_request=lora_request, cache_salt=cache_salt, block_hasher=get_request_block_hasher(block_size, hash_fn), ) @@ -1337,6 +1339,63 @@ def test_kv_cache_events(blocks_to_cache: int): assert len(manager.block_pool.cached_block_hash_to_block) == 0 +@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) +def test_kv_cache_events_with_lora(blocks_to_cache: int): + """Test BlockStored events contain correct lora_id when using LoRA requests.""" + block_size = 16 + num_blocks = blocks_to_cache + 1 + + # Create KVCacheManager with events enabled + manager = KVCacheManager( + make_kv_cache_config(block_size, num_blocks), + max_model_len=8192, + enable_caching=True, + enable_kv_cache_events=True, + ) + + # Test with LoRA request + lora_request = LoRARequest( + lora_name="test_lora", lora_int_id=42, lora_path="/test/path" + ) + + num_tokens = block_size * blocks_to_cache + req_with_lora = make_request( + "lora_req", + list(range(num_tokens)), + block_size, + sha256, + lora_request=lora_request, + ) + + # Allocate slots and get events + _ = manager.allocate_slots(req_with_lora, num_tokens) + events = manager.take_events() + + # Verify BlockStored event contains correct lora_id + block_stored_event = events[-1] + assert isinstance(block_stored_event, BlockStored) + assert block_stored_event.lora_id == 42 # Should match lora_request.adapter_id + assert len(block_stored_event.block_hashes) == blocks_to_cache + assert block_stored_event.block_size == block_size + + # Clean up + manager.free(req_with_lora) + + # Test without LoRA request (should have lora_id=None) + req_without_lora = make_request( + "no_lora_req", list(range(num_tokens)), block_size, sha256 + ) + + _ = manager.allocate_slots(req_without_lora, num_tokens) + events = manager.take_events() + + block_stored_event = events[-1] + assert isinstance(block_stored_event, BlockStored) + assert block_stored_event.lora_id is None # Should be None when no LoRA request + assert len(block_stored_event.block_hashes) == blocks_to_cache + assert block_stored_event.block_size == block_size + + def test_eagle_enabled_removes_last_block(): """Verify Eagle does NOT remove blocks when request length is divisible by block size.""" diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 15c06a0b107d8..55710ad5cc693 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -259,7 +259,9 @@ class BlockPool: num_cached_blocks * block_size : num_full_blocks * block_size ], block_size=block_size, - lora_id=request.lora_request.id if request.lora_request else None, + lora_id=request.lora_request.adapter_id + if request.lora_request + else None, medium=MEDIUM_GPU, ) ) From 470ad118b6238e66094c9a508dea0aaaaf864093 Mon Sep 17 00:00:00 2001 From: Sungyoon Jeong 
<157349761+n0gu-furiosa@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:21:18 +0900 Subject: [PATCH 085/976] [Frontend] Align finish_reason when tool is called with OpenAI (#25054) Signed-off-by: Sungyoon Jeong Co-authored-by: Chauncey --- vllm/entrypoints/openai/serving_chat.py | 26 +++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index bb770ecf03383..25979d5502b07 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1170,9 +1170,13 @@ class OpenAIServingChat(OpenAIServing): ) # Send the finish response for each request.n only once + # In OpenAI's API, when a tool is called, the + # finish_reason is: + # "tool_calls" for "auto" or "required" tool calls, + # and "stop" for named tool calls. if ( auto_tools_called - or tools_streamed[i] + or (tools_streamed[i] and not tool_choice_function_name) or (self.use_harmony and harmony_tools_streamed[i]) ): finish_reason_ = "tool_calls" @@ -1523,18 +1527,24 @@ class OpenAIServingChat(OpenAIServing): message = ChatMessage( role=role, reasoning_content=reasoning_content, content=content ) + # In OpenAI's API, when a tool is called, the finish_reason is: + # "tool_calls" for "auto" or "required" tool calls, + # and "stop" for named tool calls. + is_finish_reason_tool_calls = auto_tools_called or ( + request.tool_choice + and request.tool_choice == "required" + and output.finish_reason == "stop" + ) choice_data = ChatCompletionResponseChoice( index=output.index, message=message, logprobs=logprobs, - finish_reason=( - "tool_calls" - if auto_tools_called - else output.finish_reason - if output.finish_reason - else "stop" - ), + finish_reason="tool_calls" + if is_finish_reason_tool_calls + else output.finish_reason + if output.finish_reason + else "stop", stop_reason=output.stop_reason, token_ids=( as_list(output.token_ids) if request.return_token_ids else None From 18961c5ea62976efc50525b72e40337993c5e4f9 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Mon, 3 Nov 2025 06:48:03 +0100 Subject: [PATCH 086/976] [Hybrid] Pass kernel block size to builders (#27753) Signed-off-by: Thomas Parnell --- vllm/v1/attention/backends/flash_attn.py | 6 +++- vllm/v1/kv_cache_interface.py | 8 ++++- vllm/v1/worker/gpu_model_runner.py | 31 +++++++++++++---- vllm/v1/worker/utils.py | 44 ++++++++++++++---------- 4 files changed, 62 insertions(+), 27 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 1eac94940e781..07f9ef173b4e3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -62,7 +62,11 @@ class FlashAttentionBackend(AttentionBackend): @staticmethod def get_supported_kernel_block_size() -> list[int | MultipleOf]: - return [MultipleOf(16)] + # NOTE(tdoublep): while in principle, FA supports + # MultipleOf(16), these are the block sizes that do not + # suffer from the NaN propagation problem described here: + # https://github.com/Dao-AILab/flash-attention/issues/1974 + return [16, 32, 64] @classmethod def validate_head_size(cls, head_size: int) -> None: diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 0f564fdb3b080..7f33eb7e699c7 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -from dataclasses import dataclass, fields +from 
dataclasses import dataclass, fields, replace from math import prod import torch @@ -44,6 +44,12 @@ class KVCacheSpec: """ raise NotImplementedError + def copy_with_new_block_size(self, block_size: int) -> Self: + """ + Create a new KVCacheSpec from self but replacing the block size. + """ + return replace(self, block_size=block_size) + @classmethod def merge(cls, specs: list[Self]) -> Self: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 66a9d72912618..9212221bb6009 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4039,16 +4039,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] for (attn_backend, kv_cache_spec), layer_names in attn_backends_map.items(): - attn_group = AttentionGroup.create_with_metadata_builders( + attn_group = AttentionGroup( attn_backend, layer_names, kv_cache_spec, - self.vllm_config, - self.device, kv_cache_group_id, - num_metadata_builders=1 - if not self.parallel_config.enable_dbo - else 2, ) attn_groups.append(attn_group) @@ -4067,7 +4062,27 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): for i, attn_backend_map in enumerate(attention_backend_maps): self.attn_groups.append(create_attn_groups(attn_backend_map, i)) + def initialize_metadata_builders( + self, kv_cache_config: KVCacheConfig, kernel_block_sizes: list[int] + ) -> None: + """ + Create the metadata builders for all KV cache groups and attn groups. + """ + for kv_cache_group_id in range(len(kv_cache_config.kv_cache_groups)): + for attn_group in self.attn_groups[kv_cache_group_id]: + attn_group.create_metadata_builders( + self.vllm_config, + self.device, + kernel_block_sizes[kv_cache_group_id] + if kv_cache_group_id < len(kernel_block_sizes) + else None, + num_metadata_builders=1 + if not self.parallel_config.enable_dbo + else 2, + ) # Calculate reorder batch threshold (if needed) + # Note (tdoublep): do this *after* constructing builders, + # because some of them change the threshold at init time. self.calculate_reorder_batch_threshold() def _check_and_update_cudagraph_mode( @@ -4633,6 +4648,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # kernel_block_size 64 and split the 256-token-block to 4 blocks with 64 # tokens each. kernel_block_sizes = self._prepare_kernel_block_sizes(kv_cache_config) + + # create metadata builders + self.initialize_metadata_builders(kv_cache_config, kernel_block_sizes) + # Reinitialize need to after initialize_attn_backend self.may_reinitialize_input_batch(kv_cache_config, kernel_block_sizes) kv_caches = self.initialize_kv_cache_tensors( diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 396adbcfb289f..0ca7e81a5c7b8 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import TYPE_CHECKING import torch @@ -134,31 +134,37 @@ class MultiModalBudget: @dataclass class AttentionGroup: backend: type[AttentionBackend] - # When ubatching is enabled we will have a metadata builder for each ubatch - # so that if they use internal persistant buffers for cudagraphs, and they - # won't have to worry about conflicting with the other ubatches. 
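# Illustrative sketch only (made-up numbers, not the runner's actual code): the
# kernel block-size plumbing added in this patch lets a 256-token KV-cache
# block be served by a kernel that only supports 64-token blocks, by expanding
# each cache block id into `ratio` consecutive kernel block ids.
def expand_block_ids(
    cache_block_ids: list[int],
    cache_block_size: int = 256,
    kernel_block_size: int = 64,
) -> list[int]:
    ratio = cache_block_size // kernel_block_size
    return [b * ratio + i for b in cache_block_ids for i in range(ratio)]

# e.g. cache blocks [7, 12] -> kernel blocks [28, 29, 30, 31, 48, 49, 50, 51]
assert expand_block_ids([7, 12]) == [28, 29, 30, 31, 48, 49, 50, 51]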
- metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] kv_cache_spec: KVCacheSpec kv_cache_group_id: int + # When ubatching is enabled we will have a metadata builder for each ubatch + # so that if they use internal persistant buffers for cudagraphs, and they + # won't have to worry about conflicting with the other ubatches. + metadata_builders: list[AttentionMetadataBuilder] = field( + default_factory=lambda: [] + ) - @staticmethod - def create_with_metadata_builders( - backend: type[AttentionBackend], - layer_names: list[str], - kv_cache_spec: KVCacheSpec, - vllm_config: VllmConfig, - device: torch.device, - kv_cache_group_id: int, + def create_metadata_builders( + self, + vllm_config, + device, + kernel_block_size: int | None, num_metadata_builders: int = 1, - ) -> "AttentionGroup": - metadata_builders = [ - backend.get_builder_cls()(kv_cache_spec, layer_names, vllm_config, device) + ): + kv_cache_spec_builder = ( + self.kv_cache_spec.copy_with_new_block_size(kernel_block_size) + if kernel_block_size is not None + else self.kv_cache_spec + ) + self.metadata_builders = [ + self.backend.get_builder_cls()( + kv_cache_spec_builder, + self.layer_names, + vllm_config, + device, + ) for _ in range(num_metadata_builders) ] - return AttentionGroup( - backend, metadata_builders, layer_names, kv_cache_spec, kv_cache_group_id - ) def get_metadata_builder(self, ubatch_id: int = 0) -> AttentionMetadataBuilder: assert len(self.metadata_builders) > ubatch_id From cec7c288333339028f6fe8e0ac3222e3924da90b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Delacourt?= <54138269+Flechman@users.noreply.github.com> Date: Mon, 3 Nov 2025 08:22:46 +0100 Subject: [PATCH 087/976] [Bugfix] Padded Eagle Specdec with Chunked Prefill (#26263) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Delacourt Signed-off-by: Rémi Delacourt <54138269+Flechman@users.noreply.github.com> Signed-off-by: remi Co-authored-by: Benjamin Chislett --- tests/v1/e2e/test_spec_decode.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 45b48e5858934..ea7fcdf3174ec 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -202,9 +202,9 @@ def test_speculators_model_integration( @pytest.mark.parametrize( - ["model_setup", "mm_enabled"], + ["model_setup", "mm_enabled", "chunked_prefill_enabled"], [ - (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False), + (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), pytest.param( ( "eagle3", @@ -213,11 +213,12 @@ def test_speculators_model_integration( 1, ), False, + False, marks=pytest.mark.skip( reason="Skipping due to its head_dim not being a a multiple of 32" ), ), - ( + pytest.param( ( "eagle", "meta-llama/Llama-3.1-8B-Instruct", @@ -225,7 +226,9 @@ def test_speculators_model_integration( 1, ), False, - ), + True, + marks=large_gpu_mark(min_gb=40), + ), # works on 4x H100 ( ( "eagle3", @@ -234,6 +237,7 @@ def test_speculators_model_integration( 1, ), False, + False, ), pytest.param( ( @@ -243,6 +247,7 @@ def test_speculators_model_integration( 4, ), False, + False, marks=large_gpu_mark(min_gb=80), ), # works on 4x H100 pytest.param( @@ -253,6 +258,7 @@ def test_speculators_model_integration( 4, ), True, + True, marks=large_gpu_mark(min_gb=80), ), # works on 4x H100 ( @@ -263,6 +269,7 @@ def test_speculators_model_integration( 1, 
), False, + False, ), ], ids=[ @@ -281,6 +288,7 @@ def test_eagle_correctness( sampling_config: SamplingParams, model_setup: tuple[str, str, str, int], mm_enabled: bool, + chunked_prefill_enabled: bool, attn_backend: str, ): if attn_backend == "TREE_ATTN": @@ -317,9 +325,13 @@ def test_eagle_correctness( m.setenv("VLLM_ROCM_USE_AITER", "1") method, model_name, spec_model_name, tp_size = model_setup + max_model_len = 2048 + max_num_batched_tokens = max_model_len + if chunked_prefill_enabled: + max_num_batched_tokens = 128 ref_llm = LLM( - model=model_name, max_model_len=2048, tensor_parallel_size=tp_size + model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size ) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm @@ -334,9 +346,11 @@ def test_eagle_correctness( "method": method, "model": spec_model_name, "num_speculative_tokens": 3, - "max_model_len": 2048, + "max_model_len": max_model_len, }, - max_model_len=2048, + max_model_len=max_model_len, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=chunked_prefill_enabled, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 From 7f4bdadb926936a11a88a619f56634061e824798 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 3 Nov 2025 15:36:59 +0800 Subject: [PATCH 088/976] [XPU]Refine Dockerfile.xpu, avoid oneccl dependency issue (#27964) Signed-off-by: Kunshang Ji --- docker/Dockerfile.xpu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 49ea39cad5128..4e6ef8f5ca13c 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -54,7 +54,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ - python3 setup.py install + pip install --no-build-isolation . 
CMD ["/bin/bash"] @@ -64,9 +64,6 @@ FROM vllm-base AS vllm-openai RUN --mount=type=cache,target=/root/.cache/pip \ pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope -RUN --mount=type=cache,target=/root/.cache/pip \ - pip uninstall oneccl oneccl-devel -y - # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils @@ -74,4 +71,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/" +RUN --mount=type=cache,target=/root/.cache/pip \ + pip uninstall oneccl oneccl-devel -y + ENTRYPOINT ["vllm", "serve"] From ba464e6ae24857b2db7c82f4123342b9ab90049e Mon Sep 17 00:00:00 2001 From: Misha Efimov Date: Mon, 3 Nov 2025 03:21:31 -0500 Subject: [PATCH 089/976] Add ORCA endpoint load metrics support (#24905) Signed-off-by: Misha Efimov --- tests/entrypoints/openai/test_orca_metrics.py | 128 ++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 19 ++- vllm/entrypoints/openai/orca_metrics.py | 120 ++++++++++++++++ 3 files changed, 265 insertions(+), 2 deletions(-) create mode 100644 tests/entrypoints/openai/test_orca_metrics.py create mode 100644 vllm/entrypoints/openai/orca_metrics.py diff --git a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/openai/test_orca_metrics.py new file mode 100644 index 0000000000000..d32cfde07c21e --- /dev/null +++ b/tests/entrypoints/openai/test_orca_metrics.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import openai +import pytest +import pytest_asyncio + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module", params=[True]) +def server(request, monkeypatch_module): + use_v1 = request.param + monkeypatch_module.setenv("VLLM_USE_V1", "1" if use_v1 else "0") + + args = [ + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_chat_completion_with_orca_header(server: RemoteOpenAIServer): + messages = [ + {"role": "system", "content": "you are a helpful assistant"}, + {"role": "user", "content": "what is 1+1?"}, + ] + + client = openai.OpenAI( + api_key="EMPTY", + base_url=f"http://localhost:{server.port}/v1", + default_headers={"endpoint-load-metrics-format": "TEXT"}, + ) + + # 1. Use raw client to get response headers. + raw_client = client.with_raw_response + + # 2. Make the API call using the raw_client + response_with_raw = raw_client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_headers={"endpoint-load-metrics-format": "TEXT"}, + ) + + # 3. Access the raw httpx.Response object + raw_http_response = response_with_raw.http_response + + # 4. 
Get the headers from the httpx.Response object + response_headers = raw_http_response.headers + + assert "endpoint-load-metrics" in response_headers + + +@pytest.mark.asyncio +async def test_completion_with_orca_header(client: openai.AsyncOpenAI): + # 1. Use raw client to get response headers. + raw_client = client.with_raw_response + + # 2. Make the API call using the raw_client + completion = await raw_client.completions.create( + model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + extra_headers={"endpoint-load-metrics-format": "JSON"}, + ) + + # 3. Access the raw httpx.Response object + raw_http_response = completion.http_response + + # 4. Get the headers from the httpx.Response object + response_headers = raw_http_response.headers + + assert "endpoint-load-metrics" in response_headers + + +@pytest.mark.asyncio +async def test_single_completion(client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt="Hello, my name is", + max_tokens=5, + extra_headers={"endpoint-load-metrics-format": "JSON"}, + temperature=0.0, + ) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11 + ) + + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 1 + assert completion.choices[0].prompt_logprobs is None diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 22b5584749ae7..c37aba2776aeb 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -51,6 +51,7 @@ from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args +from vllm.entrypoints.openai.orca_metrics import metrics_header from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -128,6 +129,8 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) logger = init_logger("vllm.entrypoints.openai.api_server") +ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format" + _running_tasks: set[asyncio.Task] = set() @@ -672,6 +675,9 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques @with_cancellation @load_aware_call async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + metrics_header_format = raw_request.headers.get( + ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "" + ) handler = chat(raw_request) if handler is None: return base(raw_request).create_error_response( @@ -689,7 +695,10 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re ) elif isinstance(generator, ChatCompletionResponse): - return JSONResponse(content=generator.model_dump()) + return JSONResponse( + content=generator.model_dump(), + headers=metrics_header(metrics_header_format), + ) return StreamingResponse(content=generator, media_type="text/event-stream") @@ -707,6 +716,9 @@ async def 
create_chat_completion(request: ChatCompletionRequest, raw_request: Re @with_cancellation @load_aware_call async def create_completion(request: CompletionRequest, raw_request: Request): + metrics_header_format = raw_request.headers.get( + ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "" + ) handler = completion(raw_request) if handler is None: return base(raw_request).create_error_response( @@ -729,7 +741,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request): content=generator.model_dump(), status_code=generator.error.code ) elif isinstance(generator, CompletionResponse): - return JSONResponse(content=generator.model_dump()) + return JSONResponse( + content=generator.model_dump(), + headers=metrics_header(metrics_header_format), + ) return StreamingResponse(content=generator, media_type="text/event-stream") diff --git a/vllm/entrypoints/openai/orca_metrics.py b/vllm/entrypoints/openai/orca_metrics.py new file mode 100644 index 0000000000000..3808262bf31f2 --- /dev/null +++ b/vllm/entrypoints/openai/orca_metrics.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Utility functions that create ORCA endpoint load report response headers. +""" + +import json +from collections.abc import Mapping + +from vllm.logger import init_logger +from vllm.v1.metrics.reader import Gauge, get_metrics_snapshot + +logger = init_logger(__name__) + + +def create_orca_header( + metrics_format: str, named_metrics: list[tuple[str, float]] +) -> Mapping[str, str] | None: + """ + Creates ORCA headers named 'endpoint-load-metrics' in the specified format + and adds custom metrics to named_metrics. + ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0 + ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto + + Parameters: + - metrics_format (str): The format of the header ('TEXT', 'JSON'). + - named_metrics (List[Tuple[str, float]]): List of tuples with metric names + and their corresponding double values. + + Returns: + - Optional[Mapping[str,str]]: A dictionary with header key as + 'endpoint-load-metrics' and values as the ORCA header strings with + format prefix and data in with named_metrics in. 
+ """ + + if metrics_format.lower() not in ["text", "json"]: + logger.warning( + "Warning: `%s` format is not supported in the ORCA response header", + format, + ) + return None + + header = {} + orca_report = { + "named_metrics": { + metric_name: value + for metric_name, value in named_metrics + if isinstance(metric_name, str) and isinstance(value, float) + } + } + # output example: + # endpoint-load-metrics: TEXT named_metrics.kv_cache_utilization=0.4 + if metrics_format.lower() == "text": + native_http_header = ", ".join( + [ + f"named_metrics.{metric_name}={value}" + for metric_name, value in named_metrics + if isinstance(metric_name, str) and isinstance(value, float) + ] + ) + header["endpoint-load-metrics"] = f"TEXT {native_http_header}" + + # output example: + # endpoint-load-metrics: JSON “named_metrics”: {“custom-metric-util”: 0.4} + elif metrics_format.lower() == "json": + header["endpoint-load-metrics"] = f"JSON {json.dumps(orca_report)}" + + logger.info("Created ORCA header %s", header) + + return header + + +def get_named_metrics_from_prometheus() -> list[tuple[str, float]]: + """ + Collects current metrics from Prometheus and returns some of them + in the form of the `named_metrics` list for `create_orca_header()`. + + Parameters: + - None + + Returns: + - list[tuple[str, float]]: List of tuples of metric names and their values. + """ + named_metrics: list[tuple[str, float]] = [] + # Map from prometheus metric names to ORCA named metrics. + prometheus_to_orca_metrics = { + "vllm:kv_cache_usage_perc": "kv_cache_usage_perc", + "vllm:num_requests_waiting": "num_requests_waiting", + } + metrics = get_metrics_snapshot() + for metric in metrics: + orca_name = prometheus_to_orca_metrics.get(metric.name) + # If this metric is mapped into ORCA, then add it to the report. + # Note: Only Gauge metrics are currently supported. + if orca_name is not None and isinstance(metric, Gauge): + named_metrics.append((str(orca_name), float(metric.value))) + return named_metrics + + +def metrics_header(metrics_format: str) -> Mapping[str, str] | None: + """ + Creates ORCA headers named 'endpoint-load-metrics' in the specified format. + Metrics are collected from Prometheus using `get_named_metrics_from_prometheus()`. + + ORCA headers format description: https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0 + ORCA proto https://github.com/cncf/xds/blob/main/xds/data/orca/v3/orca_load_report.proto + + Parameters: + - metrics_format (str): The format of the header ('TEXT', 'JSON'). + + Returns: + - Optional[Mapping[str,str]]: A dictionary with header key as + 'endpoint-load-metrics' and values as the ORCA header strings with + format prefix and data in with named_metrics in. + """ + if not metrics_format: + return None + # Get named metrics from prometheus. 
+ named_metrics = get_named_metrics_from_prometheus() + return create_orca_header(metrics_format, named_metrics) From 32257297dd4dcb996a0fb4641c2018289d20396b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 3 Nov 2025 16:50:06 +0800 Subject: [PATCH 090/976] [CI/Build] Remove the flaky gpt-oss lora test (#27966) Signed-off-by: Jee Jee Li --- tests/lora/test_gptoss_tp.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/lora/test_gptoss_tp.py b/tests/lora/test_gptoss_tp.py index db4b7ca5ef499..711d514a39eb3 100644 --- a/tests/lora/test_gptoss_tp.py +++ b/tests/lora/test_gptoss_tp.py @@ -32,7 +32,6 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of ###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501 EXPECTED_LORA_OUTPUT = [ - "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;", "SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;", "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;", "SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;", @@ -41,9 +40,6 @@ EXPECTED_LORA_OUTPUT = [ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: prompts = [ - PROMPT_TEMPLATE.format( - context="What is the average number of working horses of farms with more than 5000 total number of horses?" # noqa: E501 - ), # noqa: E501 PROMPT_TEMPLATE.format( context="Give the average number of working horses on farms with more than 5000 total horses." # noqa: E501 ), # noqa: E501 @@ -67,7 +63,6 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - for i in range(len(EXPECTED_LORA_OUTPUT)): assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i]) From 40b69e33e796efdc75e774a1c38cc73397ea6e17 Mon Sep 17 00:00:00 2001 From: zhang-prog <69562787+zhang-prog@users.noreply.github.com> Date: Mon, 3 Nov 2025 19:04:22 +0800 Subject: [PATCH 091/976] [Model] Add PaddleOCR-VL Model Support (#27758) Signed-off-by: zhangyue Signed-off-by: Roger Wang Signed-off-by: Isotr0py Signed-off-by: zhangyue66 Co-authored-by: Roger Wang Co-authored-by: Isotr0py --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 27 + .../vision_language_multi_image.py | 22 + tests/models/registry.py | 4 + vllm/model_executor/models/ernie45.py | 10 + vllm/model_executor/models/paddleocr_vl.py | 1407 +++++++++++++++++ vllm/model_executor/models/registry.py | 4 + 7 files changed, 1475 insertions(+) create mode 100644 vllm/model_executor/models/paddleocr_vl.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index fd25647dce54b..21235e305db4b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -675,6 +675,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | | `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | | `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | +| `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I+ | `PaddlePaddle/PaddleOCR-VL`, etc. | | | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. 
| | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index c1ea95f8d0644..371cf6309a678 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1242,6 +1242,32 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData: ) +# PaddleOCR-VL +def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "PaddlePaddle/PaddleOCR-VL" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + limit_mm_per_prompt={modality: 1}, + trust_remote_code=True, + ) + + placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" + prompts = [ + (f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: ") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # PaliGemma def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1817,6 +1843,7 @@ model_example_map = { "NVLM_D": run_nvlm_d, "ovis": run_ovis, "ovis2_5": run_ovis2_5, + "paddleocr_vl": run_paddleocr_vl, "paligemma": run_paligemma, "paligemma2": run_paligemma2, "phi3_v": run_phi3v, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 5cb47c15038e8..80c7fc4431229 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -801,6 +801,27 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "PaddlePaddle/PaddleOCR-VL" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=2, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls) + prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: " + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistral-community/pixtral-12b" @@ -1312,6 +1333,7 @@ model_example_map = { "NVLM_D": load_nvlm_d, "ovis": load_ovis, "ovis2_5": load_ovis2_5, + "paddleocr_vl": load_paddleocr_vl, "phi3_v": load_phi3v, "phi4_mm": load_phi4mm, "phi4_multimodal": load_phi4_multimodal, diff --git a/tests/models/registry.py b/tests/models/registry.py index 8e1dd4ba91f1d..00fe999805003 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -712,6 +712,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { }, ), "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True), + "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo( + "PaddlePaddle/PaddleOCR-VL", + trust_remote_code=True, + ), "PaliGemmaForConditionalGeneration": _HfExamplesInfo( "google/paligemma-3b-mix-224", extras={"v2": "google/paligemma2-3b-ft-docci-448"}, diff --git a/vllm/model_executor/models/ernie45.py 
b/vllm/model_executor/models/ernie45.py index b1d26cddcc5eb..c1a4737e1f326 100644 --- a/vllm/model_executor/models/ernie45.py +++ b/vllm/model_executor/models/ernie45.py @@ -23,12 +23,22 @@ # limitations under the License. """Inference-only Erine model compatible with HuggingFace weights.""" +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.model_executor.models.llama import LlamaForCausalLM from .utils import PPMissingLayer +@support_torch_compile( + # set dynamic_arg_dims to support mrope + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) class Ernie4_5ForCausalLM(LlamaForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py new file mode 100644 index 0000000000000..377b41a355782 --- /dev/null +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -0,0 +1,1407 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import Annotated, Literal + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from transformers import BatchFeature, PretrainedConfig +from transformers.activations import GELUActivation +from transformers.modeling_outputs import ( + BaseModelOutputWithPooling, +) +from transformers.utils import torch_int + +from vllm.attention.backends.registry import _Backend +from vllm.attention.layer import ( + check_upstream_fa_availability, + maybe_get_vit_flash_attn_backend, +) +from vllm.attention.ops.vit_attn_wrappers import ( + vit_flash_attn_wrapper, + vit_xformers_attn_wrapper, +) +from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding.common import ( + dispatch_rotary_emb_function, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargs, +) +from vllm.multimodal.parse import ( + ImageProcessorItems, + ImageSize, + MultiModalDataItems, +) +from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, + PromptUpdate, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from .ernie45 import Ernie4_5ForCausalLM +from .interfaces import MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + WeightsMapper, + is_pp_missing_parameter, + maybe_prefix, +) +from .vision import get_vit_attn_backend + + +def smart_resize( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 28 * 28 * 130, + max_pixels: int = 28 * 28 * 1280, +): + """Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + + 3. The aspect ratio of the image is maintained as closely as possible. 
+ + """ + + if height < factor: + width = round((width * factor) / height) + height = factor + + if width < factor: + height = round((height * factor) / width) + width = factor + + if max(height, width) / min(height, width) > 200: + raise ValueError( + f"absolute aspect ratio must be smaller than 200, " + f"got {max(height, width) / min(height, width)}" + ) + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = math.floor(height / beta / factor) * factor + w_bar = math.floor(width / beta / factor) * factor + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + return h_bar, w_bar + + +def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2) + + +def apply_rotary_emb_torch( + x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False +) -> torch.Tensor: + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat( + cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)" + ) + sin = repeat( + sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)" + ) + return torch.cat( + [ + x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, + x[..., ro_dim:], + ], + dim=-1, + ) + + +def apply_rotary_pos_emb_vision(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + rotary_emb_function = dispatch_rotary_emb_function(default=apply_rotary_emb_torch) + t_ = t.float() + cos = freqs.cos() + sin = freqs.sin() + output = rotary_emb_function(t_, cos, sin).type_as(t) + return output + + +class PaddleOCRVLProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(**kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self): + return {"image": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor, + ) -> int: + if image_processor is None: + image_processor = self.get_image_processor() + + do_resize = True + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, height=image_height) + + grid_t = 1 + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_image_tokens = num_patches // (merge_size**2) + + return num_image_tokens + + def get_image_size_with_most_features(self) -> 
ImageSize: + hf_config = self.get_hf_config() + image_size = hf_config.vision_config.image_size + return ImageSize(height=image_size, width=image_size) + + +class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + max_image_size = self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + + return { + "image": self._get_dummy_images( + width=max_image_size.width, + height=max_image_size.height, + num_images=num_images, + overrides=image_overrides, + ) + } + + +class PaddleOCRVLMultiModalProcessor( + BaseMultiModalProcessor[PaddleOCRVLProcessingInfo] +): + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) + num_patches_per_image = processed_outputs["image_grid_thw"].prod(-1) + processed_outputs["pixel_values"] = processed_outputs["pixel_values"].split( + num_patches_per_image.tolist() + ) + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer( + prompt, add_special_tokens=True, return_tensors="pt" + ) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_grid_thw=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_id + + def get_replacement(item_idx: int, image_processor): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + image_processor=image_processor, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=partial(get_replacement, image_processor=image_processor), + ), + ] + + +class Projector(nn.Module): + def __init__( + self, + text_config: PretrainedConfig, + vision_config: PretrainedConfig, + prefix: str = "", + ): + super().__init__() + self.text_config = text_config + self.vision_config = vision_config + self.merge_kernel_size = (2, 2) + + self.hidden_size = ( + self.vision_config.hidden_size + * self.merge_kernel_size[0] + * self.merge_kernel_size[1] + ) + + self.pre_norm = torch.nn.LayerNorm(self.vision_config.hidden_size, eps=1e-05) + self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True) + self.act = GELUActivation() + 
self.linear_2 = nn.Linear( + self.hidden_size, self.text_config.hidden_size, bias=True + ) + + def forward( + self, + image_features: torch.Tensor, + image_grid_thw: torch.Tensor, + ) -> torch.Tensor: + m1, m2 = self.merge_kernel_size + if isinstance(image_features, (list, tuple)): + processed_features = list() + for image_feature, image_grid in zip(image_features, image_grid_thw): + image_feature = self.pre_norm(image_feature) + t, h, w = image_grid + + image_feature = rearrange( + image_feature, + "(t h p1 w p2) d -> (t h w) (p1 p2 d)", + t=t, + h=h // m1, + p1=m1, + w=w // m2, + p2=m2, + ) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + processed_features.append(hidden_states) + + return processed_features + + dims = image_features.shape[:-1] + dim = image_features.shape[-1] + image_features = image_features.view(np.prod(dims), dim) + hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states.view(*dims, -1) + + +class PaddleOCRImagePixelInputs(TensorSchema): + type: Literal["pixel_values"] + pixel_values: Annotated[ + torch.Tensor, + TensorShape("bn", "p", 3, "patch_size", "patch_size", dynamic_dims={"p"}), + ] + image_grid_thw: Annotated[ + torch.Tensor, + TensorShape("bn", 3), + ] + + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.cache_position_embedding = dict() + self.cache_position_count = dict() + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.packing_position_embedding = nn.Embedding(32768, self.embed_dim) + + self.register_buffer( + "position_ids", + torch.arange(self.num_positions).expand((1, -1)), + persistent=False, + ) + + def interpolate_pos_encoding( + self, + embeddings: torch.Tensor, + height: int, + width: int, + is_after_patchify: bool = False, + ) -> torch.Tensor: + num_positions = self.position_embedding.weight.shape[0] + + patch_pos_embed = self.position_embedding.weight.unsqueeze(0) + + dim = embeddings.shape[-1] + + if is_after_patchify: + new_height = height + new_width = width + else: + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape( + 1, sqrt_num_positions, sqrt_num_positions, dim + ) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bilinear", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return patch_pos_embed + + def fetch_position_embedding_lfu_cache( + self, embeddings: torch.Tensor, h: int, w: int, max_cache: int = 20 + ): + grid = (h, w) + if grid in self.cache_position_embedding: + self.cache_position_count[grid] += 1 + return self.cache_position_embedding[grid] + + if 
len(self.cache_position_embedding) >= max_cache: + min_hit_grid = min( + self.cache_position_count, + key=self.cache_position_count.get, + ) + self.cache_position_count.pop(min_hit_grid) + self.cache_position_embedding.pop(min_hit_grid) + + position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True) + self.cache_position_count[grid] = 1 + self.cache_position_embedding[grid] = position_embedding + return position_embedding + + def forward( + self, + pixel_values: torch.FloatTensor, + position_ids: torch.Tensor | None = None, + image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] + | None = None, + interpolate_pos_encoding=False, + ) -> torch.Tensor: + if pixel_values.dim() == 4: + pixel_values = pixel_values.unsqueeze(0) + if pixel_values.dim() == 5: + if position_ids is None: + raise ValueError( + "position_ids cannot be None when pixel_values.dim() is 5." + ) + ( + batch_size, + squence_len, + channel, + height, + width, + ) = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w") + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) + embeddings = patch_embeds.flatten(-2).squeeze(-1) + + if interpolate_pos_encoding and image_grid_thw is not None: + start = 0 + tmp_embeddings = list() + for image_grid in image_grid_thw: + t, h, w = image_grid + end = start + t * h * w + image_embeddings = embeddings[start:end, :] + position_embedding = ( + self.interpolate_pos_encoding(image_embeddings, h, w, True) + .squeeze(0) + .repeat(t, 1) + ) + image_embeddings = image_embeddings + position_embedding + tmp_embeddings.append(image_embeddings) + start = end + embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0) + else: + embeddings = embeddings + self.packing_position_embedding(position_ids) + return embeddings + else: + raise ValueError( + "Unsupported pixel_values dimension:" + f" {pixel_values.dim()}. Expected 4 or 5." 
+ ) + + +def all_gather_interleave(local_tensor: torch.Tensor, hidden_size: int, tp_size: int): + """All-gather the input tensor interleavely across model parallel group.""" + import torch.distributed as dist + + gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] + dist.all_gather( + gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group + ) + + gathered_tensors_split = [ + torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors + ] + ordered_tensors = [ + tensor for pair in zip(*gathered_tensors_split) for tensor in pair + ] + result_tensor = torch.cat(ordered_tensors, dim=-1) + return result_tensor + + +class SiglipAttention(nn.Module): + """SigLIP vision attention adapted from Qwen2.5-VisionAttention.""" + + def __init__( + self, + *, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + attn_backend: _Backend = _Backend.TORCH_SDPA, + attn_backend_override: _Backend | None = None, + use_upstream_fa: bool = False, + ) -> None: + super().__init__() + + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads + ) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size + ) + + self.qkv_proj = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.out_proj = RowParallelLinear( + input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + + self.attn_backend = attn_backend + self.use_upstream_fa = use_upstream_fa + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + self.use_upstream_fa, + attn_backend_override=attn_backend_override, + ) + ) + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, + _Backend.ROCM_AITER_FA, + } + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size) + + q, k, v = qkv.chunk(3, dim=2) + + if self.tp_size > 1: + splitter = partial( + dist_utils.split_tensor_along_last_dim, num_partitions=self.tp_size + ) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + new_shape = ( + seq_len, + bs, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor | None, + max_seqlen: torch.Tensor | None, + seqlens: torch.Tensor | None, + ) -> torch.Tensor: + batch_size, _, _ = hidden_states.shape + + x = rearrange(hidden_states, "b s d -> s b d") + x, _ = self.qkv_proj(x) + q, k, v = self.split_qkv(x) + q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v)) + + if rotary_pos_emb is not None: + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) + + if self.is_flash_attn_backend: + if max_seqlen is None: + raise 
ValueError("Flash attention backend requires max_seqlen.") + context_layer = vit_flash_attn_wrapper( + q, + k, + v, + cu_seqlens, + max_seqlen, + batch_size, + self.attn_backend == _Backend.ROCM_AITER_FA, + self.use_upstream_fa, + ) + elif self.attn_backend == _Backend.TORCH_SDPA: + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = ( + rearrange(tensor, "b s h d -> b h s d") + for tensor in (q_i, k_i, v_i) + ) + output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() + elif self.attn_backend == _Backend.XFORMERS: + if seqlens is None: + raise ValueError("xFormers attention backend requires seqlens tensor.") + context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) + else: + raise RuntimeError( + f"PaddleOCR-VL does not support {self.attn_backend} backend now." + ) + + output, _ = self.out_proj(context_layer) + output = rearrange(output, "s b d -> b s d") + return output + + +class SigLIPRotaryEmbedding(nn.Module): + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + self.rope_init() + + def rope_init(self): + inv_freq = 1.0 / ( + self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange( + seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype, + ) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + +class SiglipMLP(nn.Module): + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + # Special handling for BNB and torchao quantization + if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]: + quantizable = True + else: + # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = ( + config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0 + ) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc2", + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + +class SiglipEncoderLayer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + *, + attn_backend: _Backend = _Backend.TORCH_SDPA, + attn_backend_override: _Backend | None = None, + use_upstream_fa: bool = False, + ): + super().__init__() + self.embed_dim = config.hidden_size + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.self_attn = SiglipAttention( + embed_dim=config.hidden_size, + 
num_heads=config.num_attention_heads, + projection_size=config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + attn_backend=attn_backend, + attn_backend_override=attn_backend_override, + use_upstream_fa=use_upstream_fa, + ) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP( + config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + def forward( + self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor | None, + max_seqlen: torch.Tensor | None, + seqlens: torch.Tensor | None, + ) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + +class SiglipEncoder(nn.Module): + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + attn_backend_override: _Backend | None = None, + ): + super().__init__() + self.config = config + embed_dim = config.hidden_size + num_heads = config.num_attention_heads + head_dim = embed_dim // num_heads + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, + dtype=torch.get_default_dtype(), + attn_backend_override=attn_backend_override, + ) + self.use_upstream_fa = False + if self.attn_backend not in { + _Backend.FLASH_ATTN, + _Backend.ROCM_AITER_FA, + } and check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = _Backend.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + _Backend.FLASH_ATTN, + _Backend.TORCH_SDPA, + _Backend.XFORMERS, + _Backend.ROCM_AITER_FA, + }: + raise RuntimeError( + f"PaddleOCR-VL does not support {self.attn_backend} backend now." 
+ ) + self.layers = nn.ModuleList( + [ + SiglipEncoderLayer( + config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + attn_backend=self.attn_backend, + attn_backend_override=attn_backend_override, + use_upstream_fa=self.use_upstream_fa, + ) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2) + + @staticmethod + def flatten_list(image_grid_thw): + tmp_image_grid_thw = list() + for image_grid in image_grid_thw: + if isinstance(image_grid, list): + tmp_image_grid_thw.extend(image_grid) + else: + tmp_image_grid_thw.append(image_grid) + return tmp_image_grid_thw + + def forward( + self, + inputs_embeds, + cu_seqlens: torch.Tensor | None = None, + image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] + | None = None, + height_position_ids: torch.Tensor | None = None, + width_position_ids: torch.Tensor | None = None, + ) -> torch.Tensor: + device = inputs_embeds.device + hidden_states = inputs_embeds + + flatten_image_grid_thw = self.flatten_list(image_grid_thw) + + if width_position_ids is None or height_position_ids is None: + split_hids = list() + split_wids = list() + for t, h, w in flatten_image_grid_thw: + image_pids = torch.arange(t * h * w, device=device) % (h * w) + sample_hids = image_pids // w + sample_wids = image_pids % w + split_hids.append(sample_hids) + split_wids.append(sample_wids) + width_position_ids = torch.concat(split_wids, dim=0) + height_position_ids = torch.concat(split_hids, dim=0) + + pids = torch.stack( + [height_position_ids, width_position_ids], + dim=-1, + ) + max_grid_size = pids.max() + 1 + rope_emb_max_grid = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rope_emb_max_grid[pids].flatten(1) + + if cu_seqlens is None: + raise ValueError("cu_seqlens cannot be None for SiglipEncoder.") + if not isinstance(cu_seqlens, torch.Tensor): + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=device) + else: + cu_seqlens = cu_seqlens.to(device=device) + + max_seqlen = None + seqlens = None + if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + return hidden_states + + +class SiglipVisionTransformer(nn.Module): + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + attn_backend_override: _Backend | None = None, + ): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder( + config, + quant_config=quant_config, + prefix=f"{prefix}.encoder", + attn_backend_override=attn_backend_override, + ) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values: torch.Tensor, + interpolate_pos_encoding: bool | None = False, + position_ids: torch.Tensor | None = None, + height_position_ids: torch.Tensor | None = None, + width_position_ids: torch.Tensor | None = None, + cu_seqlens: torch.Tensor | None = None, + image_grid_thw: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = self.embeddings( + pixel_values, + 
interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + ) + + last_hidden_state = self.encoder( + inputs_embeds=hidden_states, + cu_seqlens=cu_seqlens, + image_grid_thw=image_grid_thw, + height_position_ids=height_position_ids, + width_position_ids=width_position_ids, + ) + + last_hidden_state = self.post_layernorm(last_hidden_state) + return last_hidden_state + + +class SiglipVisionModel(nn.Module): + def __init__( + self, + config, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + attn_backend_override: _Backend | None = None, + ): + super().__init__() + + self.vision_model = SiglipVisionTransformer( + config, + quant_config=quant_config, + prefix=f"{prefix}.vision_model", + attn_backend_override=attn_backend_override, + ) + self.quant_config = quant_config + + @property + def dtype(self) -> torch.dtype: + return self.vision_model.embeddings.patch_embedding.weight.dtype + + @property + def device(self) -> torch.device: + return self.vision_model.embeddings.patch_embedding.weight.device + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values, + interpolate_pos_encoding: bool = False, + position_ids: torch.Tensor | None = None, + image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]] + | None = None, + cu_seqlens: torch.Tensor | None = None, + ) -> BaseModelOutputWithPooling: + return self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + cu_seqlens=cu_seqlens, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "head.attention" in name or "head.layernorm" in name: + continue + if "head.mlp" in name or "head.probe" in name: + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + param = params_dict[scale_name] + weight_loader = getattr( + param, + "weight_loader", + default_weight_loader, + ) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for ( + param_name, + weight_name, + shard_id, + ) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith(".bias") and name not in params_dict: + continue + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr( + param, + "weight_loader", + default_weight_loader, + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +@MULTIMODAL_REGISTRY.register_processor( + PaddleOCRVLMultiModalProcessor, + info=PaddleOCRVLProcessingInfo, + 
dummy_inputs=PaddleOCRVLDummyInputsBuilder, +) +class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsMRoPE): + merge_by_field_config = True + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.": "language_model.model.", + "lm_head.": "language_model.lm_head.", + } + ) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + attn_backend_override = ( + multimodal_config.mm_encoder_attn_backend + if multimodal_config is not None + else None + ) + + self.visual = SiglipVisionModel( + config=config.vision_config, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + attn_backend_override=attn_backend_override, + ) + self.mlp_AR = Projector(config, config.vision_config) + + self.language_model = Ernie4_5ForCausalLM( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + + for layer in self.language_model.model.layers: + if not isinstance(layer, PPMissingLayer): + layer.self_attn.rotary_emb.is_neox_style = True + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.language_model.compute_logits(hidden_states) + + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: list[list[int]] | torch.Tensor, + video_grid_thw: list[list[int]] | torch.Tensor, + second_per_grid_ts: list[float], + context_len: int = 0, + seq_len: int | None = None, + audio_feature_lengths: torch.Tensor | None = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0) + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id + ).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_videos > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_second_per_grid_t = 1.0 + if 
second_per_grid_ts: + video_second_per_grid_t = second_per_grid_ts[video_index] + video_index += 1 + remain_videos -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + t_index = ( + ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + * video_second_per_grid_t + * tokens_per_second + ) + .long() + .flatten() + ) + + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta + + def get_language_model(self) -> nn.Module: + return self.language_model + + def _parse_and_validate_image_input( + self, **kwargs: object + ) -> PaddleOCRImagePixelInputs | None: + pixel_values = kwargs.pop("pixel_values", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None: + return None + + return PaddleOCRImagePixelInputs( + type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ): + if intermediate_tensors is not None: + inputs_embeds = None + + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + is_multimodal = kwargs.pop("is_multimodal", None) + handle_oov_mm_token = kwargs.pop("handle_oov_mm_token", False) + inputs_embeds = self.get_input_embeddings( + input_ids, + vision_embeddings, + is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, + ) + input_ids = None + + return self.language_model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" + + raise ValueError("Only image modality is supported") + + def encode_image( + self, pixel_values: torch.Tensor, image_grid_thw: torch.Tensor + ) -> torch.Tensor: + pixel_values = pixel_values.type(self.visual.dtype) + siglip_position_ids = list() + image_grid_hws = list() + cu_seqlens = [0] + + thw_tuple = tuple(image_grid_thw.tolist()) + numel = np.prod(thw_tuple) + image_grid_hws.append(thw_tuple) + image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) + siglip_position_ids.append(image_position_ids) + cu_seqlens.append(cu_seqlens[-1] + numel) + + siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to( + pixel_values.device + ) + cu_seqlens = 
torch.tensor(cu_seqlens, dtype=torch.int32).to(pixel_values.device) + + vision_outputs = self.visual( + pixel_values=pixel_values, + image_grid_thw=image_grid_hws, + position_ids=siglip_position_ids, + interpolate_pos_encoding=True, + cu_seqlens=cu_seqlens, + ) + return vision_outputs + + def _process_image_input( + self, image_input: PaddleOCRImagePixelInputs + ) -> MultiModalEmbeddings: + pixel_values = image_input.pixel_values + image_grid_thw = image_input.image_grid_thw + vision_outputs = tuple( + self.encode_image(pixel, grid).squeeze(0) + for pixel, grid in zip(pixel_values, image_grid_thw) + ) + image_embeds = self.mlp_AR(vision_outputs, image_grid_thw) + return image_embeds + + def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return () + + multimodal_embeddings: tuple[torch.Tensor, ...] = () + image_embeds = self._process_image_input(image_input) + multimodal_embeddings += tuple(image_embeds) + + return multimodal_embeddings + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + return autoloaded_weights diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7eca1a09e5365..d9299697fcb03 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -340,6 +340,10 @@ _MULTIMODAL_MODELS = { "NVLM_D": ("nvlm_d", "NVLM_D_Model"), "Ovis": ("ovis", "Ovis"), "Ovis2_5": ("ovis2_5", "Ovis2_5"), + "PaddleOCRVLForConditionalGeneration": ( + "paddleocr_vl", + "PaddleOCRVLForConditionalGeneration", + ), "PaliGemmaForConditionalGeneration": ( "paligemma", "PaliGemmaForConditionalGeneration", From 294c805f1df9ddf62c2290989710da9d48ab4973 Mon Sep 17 00:00:00 2001 From: gnovack Date: Mon, 3 Nov 2025 04:22:17 -0800 Subject: [PATCH 092/976] Early exit for MoE LoRA kernels (#27131) Signed-off-by: gnovack Co-authored-by: Jee Jee Li --- csrc/moe/moe_lora_align_sum_kernels.cu | 27 ++++++---- csrc/moe/moe_ops.h | 15 +++--- csrc/moe/torch_bindings.cpp | 4 +- tests/lora/test_fused_moe_lora_kernel.py | 6 +++ tests/lora/test_moe_lora_align_sum.py | 4 ++ tests/lora/test_olmoe_tp.py | 50 ++++++++++++++++--- vllm/_custom_ops.py | 4 ++ vllm/lora/layers/fused_moe.py | 11 +++- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 25 ++++++++-- vllm/lora/punica_wrapper/punica_base.py | 2 + vllm/lora/punica_wrapper/punica_gpu.py | 9 +++- 11 files changed, 123 insertions(+), 34 deletions(-) diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu index e76d1c3667853..360f1312cf579 100644 --- a/csrc/moe/moe_lora_align_sum_kernels.cu +++ b/csrc/moe/moe_lora_align_sum_kernels.cu @@ -28,11 +28,16 @@ __global__ void moe_lora_align_sum_kernel( int64_t block_size, int num_experts, int max_loras, size_t numel, int max_num_tokens_padded, int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, - int topk_num, int32_t* total_tokens_post_pad) { + int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled, + int32_t* lora_ids) { const size_t tokens_per_thread = div_ceil(numel, blockDim.x); const size_t start_idx = threadIdx.x * tokens_per_thread; - int lora_id = blockIdx.x; + int lora_idx = blockIdx.x; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1 || adapter_enabled[lora_id] == 0) { + return; + } 
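+  // Past this point lora_id refers to an active adapter slot. Shared memory
+  // holds num_experts + 1 int32 cumulative-sum entries followed by the
+  // expert token counters.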
extern __shared__ int32_t shared_mem[]; int32_t* cumsum = shared_mem; token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1); @@ -121,14 +126,13 @@ __global__ void moe_lora_align_sum_kernel( } } -void moe_lora_align_block_size(torch::Tensor topk_ids, - torch::Tensor token_lora_mapping, - int64_t num_experts, int64_t block_size, - int64_t max_loras, int64_t max_num_tokens_padded, - int64_t max_num_m_blocks, - torch::Tensor sorted_token_ids, - torch::Tensor expert_ids, - torch::Tensor num_tokens_post_pad) { +void moe_lora_align_block_size( + torch::Tensor topk_ids, torch::Tensor token_lora_mapping, + int64_t num_experts, int64_t block_size, int64_t max_loras, + int64_t max_num_tokens_padded, int64_t max_num_m_blocks, + torch::Tensor sorted_token_ids, torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, + torch::Tensor lora_ids) { const int topk_num = topk_ids.size(1); TORCH_CHECK(block_size > 0, "block_size should be greater than 0. "); @@ -164,6 +168,7 @@ void moe_lora_align_block_size(torch::Tensor topk_ids, max_loras, topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks, sorted_token_ids.data_ptr(), expert_ids.data_ptr(), topk_num, - num_tokens_post_pad.data_ptr()); + num_tokens_post_pad.data_ptr(), + adapter_enabled.data_ptr(), lora_ids.data_ptr()); }); } \ No newline at end of file diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index e4bf0aa99421b..0adf745689b2f 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -20,14 +20,13 @@ void batched_moe_align_block_size(int64_t max_tokens_per_batch, torch::Tensor expert_ids, torch::Tensor num_tokens_post_pad); -void moe_lora_align_block_size(torch::Tensor topk_ids, - torch::Tensor token_lora_mapping, - int64_t num_experts, int64_t block_size, - int64_t max_loras, int64_t max_num_tokens_padded, - int64_t max_num_m_blocks, - torch::Tensor sorted_token_ids, - torch::Tensor expert_ids, - torch::Tensor num_tokens_post_pad); +void moe_lora_align_block_size( + torch::Tensor topk_ids, torch::Tensor token_lora_mapping, + int64_t num_experts, int64_t block_size, int64_t max_loras, + int64_t max_num_tokens_padded, int64_t max_num_m_blocks, + torch::Tensor sorted_token_ids, torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, + torch::Tensor lora_ids); #ifndef USE_ROCM torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor b_qweight, torch::Tensor b_scales, diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index c08a543908ef0..ace72fad71e86 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -44,7 +44,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " int max_num_m_blocks, " " Tensor !sorted_token_ids," " Tensor !experts_ids," - " Tensor !num_tokens_post_pad) -> () "); + " Tensor !num_tokens_post_pad," + " Tensor !adapter_enabled," + " Tensor !lora_ids) -> () "); m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size); #ifndef USE_ROCM diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index b724e112b9dd3..318a0e58805d3 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -134,6 +134,8 @@ def use_fused_moe_lora_kernel( ) expert_ids = torch.empty((max_loras * max_num_m_blocks,), dtype=torch.int32) num_tokens_post_padded = torch.empty((max_loras,), dtype=torch.int32) + adapter_enabled = torch.ones(max_loras + 1, dtype=torch.int32) + lora_ids 
= torch.arange(max_loras + 2, dtype=torch.int32) # call kernel ops.moe_lora_align_block_size( @@ -147,6 +149,8 @@ def use_fused_moe_lora_kernel( sorted_token_ids, expert_ids, num_tokens_post_padded, + adapter_enabled, + lora_ids, ) config = { @@ -172,6 +176,8 @@ def use_fused_moe_lora_kernel( num_tokens_post_padded, max_lora_rank, top_k_num, + lora_ids, + adapter_enabled, config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"], config["BLOCK_SIZE_K"], diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py index 6cd1281c36328..72f1d759f1e7a 100644 --- a/tests/lora/test_moe_lora_align_sum.py +++ b/tests/lora/test_moe_lora_align_sum.py @@ -60,6 +60,8 @@ def test_moe_lora_align_block_size( (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda" ) num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda") + adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda") + lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda") # call kernel ops.moe_lora_align_block_size( @@ -73,6 +75,8 @@ def test_moe_lora_align_block_size( sorted_token_ids, expert_ids, num_tokens_post_pad, + adapter_enabled, + lora_ids, ) # verify values diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py index b954e0776ca4a..e659c1e1a9a07 100644 --- a/tests/lora/test_olmoe_tp.py +++ b/tests/lora/test_olmoe_tp.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import vllm from vllm.lora.request import LoRARequest @@ -28,8 +29,17 @@ EXPECTED_LORA_OUTPUT = [ "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 ] +EXPECTED_BASE_MODEL_OUTPUT = [ + "SELECT COUNT(Candidate_ID) FROM candidate", + "SELECT COUNT(Candidate_ID) FROM candidate", + "SELECT Candidate_ID, COUNT(*) as Total_Candidates\nFROM candidate\nINNER JOIN people ON candidate.People_ID = people.People_ID", # noqa: E501 + "SELECT Candidate_ID, Poll_Source FROM candidate WHERE People_ID IN (SELECT People_ID FROM people) ORDER BY COUNT(*) DESC LIMIT 1", # noqa: E501 +] -def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: + +def generate_and_test( + llm: vllm.LLM, lora_path: str, lora_id: list[int | None] | int | None +) -> None: prompts = [ PROMPT_TEMPLATE.format(context="How many candidates are there?"), PROMPT_TEMPLATE.format(context="Count the number of candidates."), @@ -40,12 +50,18 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: context="Return the poll resource associated with the most candidates." ), ] + + lora_request = None + if isinstance(lora_id, int): + lora_request = LoRARequest(str(lora_id), lora_id, lora_path) + elif isinstance(lora_id, list): + lora_request = [ + LoRARequest(str(i), i, lora_path) if i is not None else None + for i in lora_id + ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None, - ) + outputs = llm.generate(prompts, sampling_params, lora_request=lora_request) # Print the outputs. 
generated_texts: list[str] = [] for output in outputs: @@ -55,7 +71,13 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") for i in range(len(EXPECTED_LORA_OUTPUT)): - assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i]) + req_lora_id = lora_id[i] if isinstance(lora_id, list) else lora_id + expected_output = ( + EXPECTED_LORA_OUTPUT[i] + if req_lora_id is not None + else EXPECTED_BASE_MODEL_OUTPUT[i] + ) + assert generated_texts[i].startswith(expected_output) def test_olmoe_lora(olmoe_lora_files): @@ -75,6 +97,20 @@ def test_olmoe_lora(olmoe_lora_files): generate_and_test(llm, olmoe_lora_files, lora_id=2) +def test_olmoe_lora_mixed(olmoe_lora_files): + llm = vllm.LLM( + MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + enforce_eager=True, + trust_remote_code=True, + enable_chunked_prefill=True, + ) + + generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None]) + + @multi_gpu_test(num_gpus=2) def test_olmoe_lora_tp2(olmoe_lora_files): llm = vllm.LLM( diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 61cf54fcfa39a..657b11046809d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1823,6 +1823,8 @@ def moe_lora_align_block_size( sorted_token_ids: torch.Tensor, experts_ids: torch.Tensor, num_tokens_post_pad: torch.Tensor, + adapter_enabled: torch.Tensor, + lora_ids: torch.Tensor, ) -> None: torch.ops._moe_C.moe_lora_align_block_size( topk_ids, @@ -1835,6 +1837,8 @@ def moe_lora_align_block_size( sorted_token_ids, experts_ids, num_tokens_post_pad, + adapter_enabled, + lora_ids, ) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 275a2ed0c6813..7711f5c3208bc 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -111,6 +111,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): config["BLOCK_SIZE_M"], self.base_layer.local_num_experts, max_loras, + self.adapter_enabled, expert_map, ) @@ -138,6 +139,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): max_lora_rank, top_k, config, + self.adapter_enabled, ) result = func(*args, **kwargs) @@ -196,6 +198,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): max_lora_rank, top_k, config, + self.adapter_enabled, True, ) @@ -227,6 +230,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ) -> None: """Initializes lora matrices.""" + self.adapter_enabled = torch.tensor( + [0] * (max_loras + 1), dtype=torch.int, device=self.device + ) + self.w1_lora_a_stacked = torch.zeros( ( max_loras, @@ -313,6 +320,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.w3_lora_b_stacked[index] = 0 self.w2_lora_a_stacked[index] = 0 self.w2_lora_b_stacked[index] = 0 + self.adapter_enabled[index] = 0 def set_lora( self, @@ -322,8 +330,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): embeddings_tensor: torch.Tensor | None, bias: torch.Tensor | None = None, ): - self.reset_lora(index) """Overwrites lora tensors at index.""" + self.reset_lora(index) + self.adapter_enabled[index] = 1 for eid in range(len(lora_a) // 3): w1_lora_a = lora_a[eid * 3] w2_lora_a = lora_a[eid * 3 + 1] diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 15031f5e2f9e8..539605c7c534a 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -54,6 +54,8 @@ def _fused_moe_lora_kernel( EM, num_valid_tokens, num_experts, + lora_ids, + adapter_enabled, # The stride variables represent how much to increase 
the ptr by when # moving by 1 element in a particular dimension. E.g. `stride_am` is # how much to increase `a_ptr` by to get the element one row down @@ -84,6 +86,11 @@ def _fused_moe_lora_kernel( pid = tl.program_id(axis=0) slice_id = tl.program_id(axis=1) lora_idx = tl.program_id(axis=2) + lora_id = tl.load(lora_ids + lora_idx) + moe_enabled = tl.load(adapter_enabled + lora_id) + if lora_id == -1 or moe_enabled == 0: + # Early exit for the no-lora case. + return max_loras = tl.num_programs(axis=2) grid_k = tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K) @@ -100,12 +107,12 @@ def _fused_moe_lora_kernel( pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m) pid_n = (pid_m_n % num_pid_in_group) // group_size_m - num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr + lora_idx) + num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr + lora_id) if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: return # get the expert_id to process curr shard - ind = lora_idx * stride_el + pid_m + ind = lora_id * stride_el + pid_m expert_id = tl.load(expert_ids_ptr + ind, ind < max_loras * stride_el, -1) if expert_id == -1: return @@ -119,7 +126,7 @@ def _fused_moe_lora_kernel( offs_k = pid_sk * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) - token_ind = stride_tl * lora_idx + offs_token_id + token_ind = stride_tl * lora_id + offs_token_id offs_token = tl.load( sorted_token_ids_ptr + token_ind, token_ind < max_loras * stride_tl, 0 ) @@ -132,7 +139,7 @@ def _fused_moe_lora_kernel( b_ptrs = ( cur_b_ptr - + lora_idx * stride_bl + + lora_id * stride_bl + expert_id * stride_be + offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn @@ -184,6 +191,8 @@ def _fused_moe_lora( num_tokens_post_padded: torch.Tensor, # (max_loras, ) max_lora_rank: int, top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, block_size_m: int, block_size_n: int, block_size_k: int, @@ -234,7 +243,7 @@ def _fused_moe_lora( num_tokens = M * top_k_num w1_output_dim_size = w1_lora_b_stacked.shape[2] - lora_intermediate_cache1 = torch.empty( + lora_intermediate_cache1 = torch.zeros( (num_slices * M * top_k_num * (max_lora_rank + w1_output_dim_size)), dtype=output.dtype, device=device, @@ -272,6 +281,8 @@ def _fused_moe_lora( EM, num_tokens, num_experts, + lora_ids, + adapter_enabled, qcurr_hidden_states.stride(0), qcurr_hidden_states.stride(1), w1_lora_a_stacked.stride(0), @@ -319,6 +330,8 @@ def _fused_moe_lora( EM, num_tokens, num_experts, + lora_ids, + adapter_enabled, a_intermediate_cache1.stride(0), a_intermediate_cache1.stride(1), w1_lora_b_stacked.stride(0), @@ -352,6 +365,8 @@ def _fused_moe_lora_fake( num_tokens_post_padded: torch.Tensor, max_lora_rank: int, top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, block_size_m: int, block_size_n: int, block_size_k: int, diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 5b4a18cf4789b..c552412cfd62e 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -456,6 +456,7 @@ class PunicaWrapperBase(PunicaWrapperABC): block_size: int, num_experts: int, max_loras: int, + adapter_enabled: torch.Tensor, expert_map: torch.Tensor | None = None, pad_sorted_ids: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -479,6 +480,7 @@ class PunicaWrapperBase(PunicaWrapperABC): max_lora_rank: int, top_k_num: int, config, + adapter_enabled: torch.Tensor, 
mul_routed_weight=False, ): """ diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index d9590769778ea..30def90380db1 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -305,6 +305,7 @@ class PunicaWrapperGPU(PunicaWrapperBase): block_size: int, num_experts: int, max_loras: int, + adapter_enabled: torch.Tensor, expert_map: torch.Tensor | None = None, pad_sorted_ids: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -331,7 +332,7 @@ class PunicaWrapperGPU(PunicaWrapperBase): (max_loras), dtype=torch.int32, device=topk_ids.device ) - (token_lora_mapping, _, _, _, _, _) = self.token_mapping_meta.meta_args( + (token_lora_mapping, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args( num_tokens ) @@ -346,6 +347,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): sorted_ids, expert_ids, num_tokens_post_pad, + adapter_enabled, + lora_ids, ) if expert_map is not None: expert_ids = expert_map[expert_ids] @@ -365,11 +368,13 @@ class PunicaWrapperGPU(PunicaWrapperBase): max_lora_rank: int, top_k_num: int, config, + adapter_enabled: torch.Tensor, mul_routed_weight=False, ): """ Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer. """ + (_, _, _, _, lora_ids, _) = self.token_mapping_meta.meta_args(x.size(0)) fused_moe_lora( y, x, @@ -381,6 +386,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): num_tokens_post_padded, max_lora_rank, top_k_num, + lora_ids, + adapter_enabled, config["BLOCK_SIZE_M"], config["BLOCK_SIZE_N"], config["BLOCK_SIZE_K"], From f7d2946e996f656b5f831fe2003f3b95a91fb367 Mon Sep 17 00:00:00 2001 From: pwschuurman Date: Mon, 3 Nov 2025 06:31:03 -0800 Subject: [PATCH 093/976] [Bugfix] Skip gs:// model paths for speculator detection (#27846) Signed-off-by: Peter Schuurman --- tests/transformers_utils/test_utils.py | 26 ++++++++++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++----- vllm/transformers_utils/utils.py | 8 ++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 tests/transformers_utils/test_utils.py diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py new file mode 100644 index 0000000000000..beaef04d766bf --- /dev/null +++ b/tests/transformers_utils/test_utils.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from vllm.transformers_utils.utils import is_cloud_storage, is_gcs, is_s3 + + +def test_is_gcs(): + assert is_gcs("gs://model-path") + assert not is_gcs("s3://model-path/path-to-model") + assert not is_gcs("/unix/local/path") + assert not is_gcs("nfs://nfs-fqdn.local") + + +def test_is_s3(): + assert is_s3("s3://model-path/path-to-model") + assert not is_s3("gs://model-path") + assert not is_s3("/unix/local/path") + assert not is_s3("nfs://nfs-fqdn.local") + + +def test_is_cloud_storage(): + assert is_cloud_storage("gs://model-path") + assert is_cloud_storage("s3://model-path/path-to-model") + assert not is_cloud_storage("/unix/local/path") + assert not is_cloud_storage("nfs://nfs-fqdn.local") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 66c75d944ec8b..14fd4e70ad6c0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -86,7 +86,7 @@ from vllm.transformers_utils.config import ( is_interleaved, maybe_override_with_speculators, ) -from vllm.transformers_utils.utils import check_gguf_file, is_s3 +from vllm.transformers_utils.utils import 
check_gguf_file, is_cloud_storage from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip @@ -1310,10 +1310,10 @@ class EngineArgs: # Check if the model is a speculator and override model/tokenizer/config # BEFORE creating ModelConfig, so the config is created with the target model - # Skip speculator detection for S3 models since HuggingFace cannot load - # configs directly from S3 URLs. S3 models can still use speculators with - # explicit --speculative-config. - if not is_s3(self.model): + # Skip speculator detection for cloud storage models (eg: S3, GCS) since + # HuggingFace cannot load configs directly from S3 URLs. S3 models can still + # use speculators with explicit --speculative-config. + if not is_cloud_storage(self.model): (self.model, self.tokenizer, self.speculative_config) = ( maybe_override_with_speculators( model=self.model, diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index af2df195f2958..1ae42ba622dc4 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -19,6 +19,14 @@ def is_s3(model_or_path: str) -> bool: return model_or_path.lower().startswith("s3://") +def is_gcs(model_or_path: str) -> bool: + return model_or_path.lower().startswith("gs://") + + +def is_cloud_storage(model_or_path: str) -> bool: + return is_s3(model_or_path) or is_gcs(model_or_path) + + def check_gguf_file(model: str | PathLike) -> bool: """Check if the file is a GGUF model.""" model = Path(model) From cac4c10ef0e3280f045bff32cbb05e9a56e41b1b Mon Sep 17 00:00:00 2001 From: ahao-anyscale Date: Mon, 3 Nov 2025 08:13:51 -0800 Subject: [PATCH 094/976] [BUG] Make 'binary' default option for saving torch compile artifacts when using standalone_compile (#27616) Signed-off-by: ahao-anyscale --- docs/design/torch_compile.md | 2 ++ vllm/compilation/backends.py | 4 +++- vllm/compilation/compiler_interface.py | 9 ++++++--- vllm/config/compilation.py | 23 ++++++++++++++++++++++- vllm/envs.py | 10 ++++++++++ 5 files changed, 43 insertions(+), 5 deletions(-) diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 5a3ca2de82194..27edc4f89201d 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -27,6 +27,8 @@ With all these factors taken into consideration, usually we can guarantee that t A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all the compilation finishes before we serve any requests. No requests will trigger new compilations. Otherwise, the engine would be blocked on that request, and the response time will have unexpected spikes. +By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`. 
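+
+For example, a minimal sketch of switching to the unpacked layout from Python (assuming the new field is forwarded through the `compilation_config` engine argument; the model name here is only illustrative):
+
+```python
+from vllm import LLM
+
+# Store Inductor artifacts as an inspectable directory tree instead of a binary file.
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    compilation_config={"compile_cache_save_format": "unpacked"},
+)
+```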
+ ## Python Code Compilation In the very verbose logs, we can see: diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 53fd5e74dc0a8..83d8cdae1ed34 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -51,7 +51,9 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: and hasattr(torch._inductor, "standalone_compile") ): logger.debug("Using InductorStandaloneAdaptor") - return InductorStandaloneAdaptor() + return InductorStandaloneAdaptor( + compilation_config.compile_cache_save_format + ) else: logger.debug("Using InductorAdaptor") return InductorAdaptor() diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 0a3f0769db941..d15481b3045d6 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -6,7 +6,7 @@ import hashlib import os from collections.abc import Callable from contextlib import ExitStack -from typing import Any +from typing import Any, Literal from unittest.mock import patch import torch @@ -175,6 +175,9 @@ class InductorStandaloneAdaptor(CompilerInterface): name = "inductor_standalone" + def __init__(self, save_format: Literal["binary", "unpacked"]): + self.save_format = save_format + def compute_hash(self, vllm_config: VllmConfig) -> str: factors = get_inductor_factors() hash_str = hashlib.md5( @@ -220,7 +223,7 @@ class InductorStandaloneAdaptor(CompilerInterface): assert key is not None path = os.path.join(self.cache_dir, key) if not envs.VLLM_DISABLE_COMPILE_CACHE: - compiled_graph.save(path=path, format="unpacked") + compiled_graph.save(path=path, format=self.save_format) compilation_counter.num_compiled_artifacts_saved += 1 return compiled_graph, (key, path) @@ -237,7 +240,7 @@ class InductorStandaloneAdaptor(CompilerInterface): assert isinstance(handle[1], str) path = handle[1] inductor_compiled_graph = torch._inductor.CompiledArtifact.load( - path=path, format="unpacked" + path=path, format=self.save_format ) from torch._inductor.compile_fx import graph_returns_tuple diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 6a5bd5ef4e07c..00e8cbfd7319a 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -7,11 +7,12 @@ from collections import Counter from collections.abc import Callable from dataclasses import asdict, field from pathlib import Path -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, Literal from pydantic import TypeAdapter, field_validator from pydantic.dataclasses import dataclass +import vllm.envs as envs from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.config.utils import config from vllm.logger import init_logger @@ -208,6 +209,15 @@ class CompilationConfig: """The directory to store the compiled graph, to accelerate Inductor compilation. By default, it will use model-related information to generate a cache directory.""" + compile_cache_save_format: Literal["binary", "unpacked"] = field( + default_factory=lambda: envs.VLLM_COMPILE_CACHE_SAVE_FORMAT + ) + """Format for saving torch compile cache:\n + - "binary": saves as binary file (multiprocess safe)\n + - "unpacked": saves as directory structure for inspection/debugging + (NOT multiprocess safe)\n + Defaults to `VLLM_COMPILE_CACHE_SAVE_FORMAT` if not specified. + """ backend: str = "" """The backend for compilation. 
It needs to be a string: @@ -479,6 +489,7 @@ class CompilationConfig: factors.append(self.inductor_compile_config) factors.append(self.inductor_passes) factors.append(self.pass_config.uuid()) + factors.append(self.compile_cache_save_format) return hashlib.sha256(str(factors).encode()).hexdigest() def __repr__(self) -> str: @@ -520,6 +531,16 @@ class CompilationConfig: return CUDAGraphMode[value.upper()] return value + @field_validator("compile_cache_save_format") + @classmethod + def validate_compile_cache_save_format(cls, value: str) -> str: + if value not in ("binary", "unpacked"): + raise ValueError( + f"compile_cache_save_format must be 'binary' or 'unpacked', " + f"got: {value}" + ) + return value + def __post_init__(self) -> None: if self.level is not None: logger.warning( diff --git a/vllm/envs.py b/vllm/envs.py index 21237c70a45e4..81f189ada9a6f 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -218,6 +218,7 @@ if TYPE_CHECKING: VLLM_USE_FBGEMM: bool = False VLLM_GC_DEBUG: str = "" VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False + VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" def get_default_cache_root(): @@ -1442,6 +1443,15 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: os.getenv( "VLLM_DISABLE_SHARED_EXPERTS_STREAM", False ), + # Format for saving torch.compile cache artifacts + # - "binary": saves as binary file + # Safe for multiple vllm serve processes accessing the same torch compile cache. + # - "unpacked": saves as directory structure (for inspection/debugging) + # NOT multiprocess safe - race conditions may occur with multiple processes. + # Allows viewing and setting breakpoints in Inductor's code output files. + "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices( + "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"] + ), } # --8<-- [end:env-vars-definition] From 4bc400f47e33ef27fb69608b9ad7fe992cb8ba76 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 4 Nov 2025 02:00:46 +0900 Subject: [PATCH 095/976] [CI/Testing] Add basic single node dual batch overlap test (#27235) Signed-off-by: Lucas Wilkinson --- .buildkite/test-pipeline.yaml | 2 + tests/v1/distributed/test_dbo.py | 89 ++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 tests/v1/distributed/test_dbo.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a020b0d276be0..07e2bf09d8aa0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1223,6 +1223,7 @@ steps: - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional @@ -1233,6 +1234,7 @@ steps: commands: - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py ##### RL Integration Tests ##### - label: Prime-RL Integration Test # 15min diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py new file mode 100644 index 0000000000000..866ae742bf3c0 --- /dev/null +++ 
b/tests/v1/distributed/test_dbo.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test Dual Batch Overlap (DBO) with Data Parallelism + Expert Parallelism. + +DBO is specifically designed for DP+EP scenarios to hide communication latency +by overlapping computation of two batches. This test validates that DBO works +correctly with the DeepSeek-V2-Lite model using GSM8K evaluation. +""" + +import pytest + +from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "deepseek-ai/DeepSeek-V2-Lite-Chat" +DP_SIZE = 2 + +# GSM8K eval configuration +NUM_QUESTIONS = 256 # Fast eval for CI; but must be large enough to hit dbo thresholds +NUM_SHOTS = 5 # Few-shot examples +MIN_ACCURACY = 0.62 # Expected 0.64 with 2% buffer (based on vLLM test data) + +# Increase max_num_seqs to trigger DBO for decode batches +# With 64 seqs, decode batches should exceed the 32 token threshold +MAX_NUM_SEQS = 64 # Increased from 16 to trigger decode DBO + +# DeepEP backends to test +DEEPEP_BACKENDS = [ + "deepep_low_latency", + "deepep_high_throughput", +] + + +@pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS) +def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available): + """ + Test DBO with DP+EP using GSM8K evaluation. + """ + required_gpus = DP_SIZE + + if num_gpus_available < required_gpus: + pytest.skip(f"Need at least {required_gpus} GPUs (DP={DP_SIZE})") + + # Server arguments for DBO + DP + EP + server_args = [ + "--max-model-len", + "4096", + "--max-num-seqs", + str(MAX_NUM_SEQS), # Use larger batch to trigger decode DBO + "--trust-remote-code", + # Note: Not using --enforce-eager to test DBO's alternate CUDA graph dispatching + "--data-parallel-size", + str(DP_SIZE), + "--enable-expert-parallel", + "--enable-dbo", + # Fix threshold so we know we trigger DBO + "--dbo-decode-token-threshold", + "16", + "--dbo-prefill-token-threshold", + "256", + "--all2all-backend", + all2all_backend, + ] + + with RemoteOpenAIServer( + MODEL_NAME, + server_args, + max_wait_seconds=600, # Allow time for model loading with DP+EP + ) as remote_server: + # Use host and port directly from RemoteOpenAIServer + host = f"http://{remote_server.host}" + port = remote_server.port + + # Run GSM8K evaluation + results = evaluate_gsm8k( + num_questions=NUM_QUESTIONS, + num_shots=NUM_SHOTS, + host=host, + port=port, + ) + + # Validate accuracy is reasonable + accuracy = results["accuracy"] + assert accuracy >= MIN_ACCURACY, ( + f"DBO+DP+EP accuracy too low ({all2all_backend}): " + f"{accuracy:.3f} < {MIN_ACCURACY:.3f} " + f"(correct: {results['num_correct']}/{results['num_questions']})" + ) From 2c19d96777939dd3473eabfacbe1948a3ea0b4be Mon Sep 17 00:00:00 2001 From: Aurick Qiao Date: Mon, 3 Nov 2025 09:23:31 -0800 Subject: [PATCH 096/976] [Spec Decode] Integrate Suffix Decoding from Arctic Inference (#25784) Co-authored-by: Aurick Qiao --- docs/features/spec_decode.md | 40 ++++++++++ requirements/test.in | 1 + requirements/test.txt | 2 + tests/v1/e2e/test_spec_decode.py | 85 +++++++++++++++++++-- vllm/config/speculative.py | 66 +++++++++++++++- vllm/utils/import_utils.py | 6 ++ vllm/v1/spec_decode/suffix_decoding.py | 101 +++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 14 +++- 8 files changed, 304 insertions(+), 11 deletions(-) create mode 100644 vllm/v1/spec_decode/suffix_decoding.py diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index 
ab72c7d97b7a4..6097500cac01f 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -130,6 +130,46 @@ matching n-grams in the prompt. For more information read [this thread.](https:/ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` +## Speculating using Suffix Decoding + +The following code configures vLLM to use speculative decoding where proposals are generated using Suffix Decoding ([technical report](https://arxiv.org/abs/2411.04975)). + +Like n-gram, Suffix Decoding can generate draft tokens by pattern-matching using the last `n` generated tokens. Unlike n-gram, Suffix Decoding (1) can pattern-match against both the prompt and previous generations, (2) uses frequency counts to propose the most likely continuations, and (3) speculates an adaptive number of tokens for each request at each iteration to get better acceptance rates. + +Suffix Decoding can achieve better performance for tasks with high repetition, such as code-editing, agentic loops (e.g. self-reflection, self-consistency), and RL rollouts. + +!!! tip "Install Arctic Inference" + Suffix Decoding requires [Arctic Inference](https://github.com/snowflakedb/ArcticInference). You can install it with `pip install arctic-inference`. + +!!! tip "Suffix Decoding Speculative Tokens" + Suffix Decoding will speculate a dynamic number of tokens for each request at each decoding step, so the `num_speculative_tokens` configuration specifies the *maximum* number of speculative tokens. It is suggested to use a high number such as `16` or `32` (default). + +??? code + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_config={ + "method": "suffix", + "num_speculative_tokens": 32, + }, + ) + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + ## Speculating using MLP speculators The following code configures vLLM to use speculative decoding where proposals are generated by diff --git a/requirements/test.in b/requirements/test.in index f57ec31277ce9..ce209fd276628 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -48,6 +48,7 @@ buildkite-test-collector==0.1.9 genai_perf==0.0.8 tritonclient==2.51.0 +arctic-inference == 0.1.0 # Required for suffix decoding test numba == 0.61.2 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs]==0.15.0 diff --git a/requirements/test.txt b/requirements/test.txt index a975f247065da..9d13fa4241152 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -40,6 +40,8 @@ anyio==4.6.2.post1 # via # httpx # starlette +arctic-inference==0.1.0 + # via -r requirements/test.in argcomplete==3.5.1 # via datamodel-code-generator arrow==1.3.0 diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index ea7fcdf3174ec..9b55d2b14b991 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -75,7 +75,23 @@ def model_name(): return "meta-llama/Llama-3.1-8B-Instruct" -def test_ngram_correctness( +@pytest.mark.parametrize( + "speculative_config", + [ + { + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": 3, + }, + { + "method": "suffix", + "suffix_decoding_max_spec_factor": 2.0, + }, + 
], +) +def test_ngram_and_suffix_correctness( + speculative_config: dict, monkeypatch: pytest.MonkeyPatch, sampling_config: SamplingParams, model_name: str, @@ -94,12 +110,7 @@ def test_ngram_correctness( spec_llm = LLM( model=model_name, - speculative_config={ - "method": "ngram", - "prompt_lookup_max": 5, - "prompt_lookup_min": 3, - "num_speculative_tokens": 3, - }, + speculative_config=speculative_config, max_model_len=1024, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) @@ -121,6 +132,66 @@ def test_ngram_correctness( cleanup_dist_env_and_memory() +def test_suffix_decoding_acceptance( + monkeypatch: pytest.MonkeyPatch, + sampling_config: SamplingParams, + model_name: str, +): + """ + Check that suffix decoding caching takes effect and improves acceptance + lengths and acceptance rates over multiple runs of the same prompts. + """ + test_prompts = get_test_prompts(mm_enabled=False) + + spec_llm = LLM( + model=model_name, + speculative_config={ + "method": "suffix", + "suffix_decoding_max_spec_factor": 2.0, + "suffix_decoding_max_cached_requests": 1000, + }, + max_model_len=1024, + disable_log_stats=False, + ) + + # Run several times and check that the accepted tokens increase. + spec_llm.chat(test_prompts, sampling_config) + num_draft = [] + num_accept = [] + for i in range(10): # Run multiple times to warm up the cache. + spec_llm.chat(test_prompts, sampling_config) + # Collect draft and acceptance stats. + metrics = spec_llm.get_metrics() + for metric in metrics: + if metric.name == "vllm:spec_decode_num_draft_tokens": + num_draft.append(metric.value) + if metric.name == "vllm:spec_decode_num_accepted_tokens": + num_accept.append(metric.value) + + # Calculate the acceptance rates for the first and last runs. + first_accept_tokens = num_accept[0] + first_draft_tokens = num_draft[0] + first_accept_rate = first_accept_tokens / first_draft_tokens + + # Take the diff since the stats are cumulative. + last_accept_tokens = num_accept[-1] - num_accept[-2] + last_draft_tokens = num_draft[-1] - num_draft[-2] + last_accept_rate = last_accept_tokens / last_draft_tokens + + # Expect the acceptance length to improve. + assert first_accept_tokens < last_accept_tokens + + # Expect the acceptance rate to improve. + assert first_accept_rate < last_accept_rate + + # Heuristic: expect at least 85% acceptance rate at the end. + assert last_accept_rate > 0.85 + + del spec_llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + @pytest.mark.parametrize( "model_path", [ diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 1f956526dcdc6..af1d640f8accc 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -12,7 +12,7 @@ from typing_extensions import Self from vllm.config.parallel import ParallelConfig from vllm.config.utils import config from vllm.logger import init_logger -from vllm.utils.import_utils import LazyLoader +from vllm.utils.import_utils import LazyLoader, has_arctic_inference if TYPE_CHECKING: from transformers import PretrainedConfig @@ -42,6 +42,7 @@ SpeculativeMethod = Literal[ "mimo_mtp", "longcat_flash_mtp", "mtp", + "suffix", ] MTP_MODEL_TYPES = ( "deepseek_mtp", @@ -129,6 +130,27 @@ class SpeculativeConfig: draft_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore """The parallel configuration for the draft model initialized internal.""" + # Suffix decoding configuration + suffix_decoding_max_tree_depth: int = 24 + """The maximum depth of the suffix decoding global and prompt trees. 
The + tree depth limits the sum of the prefix match and speculation lengths.""" + + suffix_decoding_max_cached_requests: int = 10000 + """The maximum number of requests to cache in the global suffix tree. If + exceeded, will trigger eviction in FIFO order. If set to 0, the global + suffix tree is disabled and past responses are not cached (prompt trees + are still used).""" + + suffix_decoding_max_spec_factor: float = 1.0 + """The maximum spec factor for suffix decoding. The spec factor controls + speculation lengths based on the prefix match length: max_spec_tokens = + max_spec_factor * prefix_match_length.""" + + suffix_decoding_min_token_prob: float = 0.1 + """The minimum token probability for suffix decoding. Will only speculate + tokens with estimated probability (based on frequency counts) greater than + or equal to this value.""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, @@ -235,6 +257,8 @@ class SpeculativeConfig: self.quantization = self.target_model_config.quantization elif self.method in ("ngram", "[ngram]"): self.model = "ngram" + elif self.method == "suffix": + self.model = "suffix" else: raise ValueError( "num_speculative_tokens was provided but without speculative model." @@ -282,6 +306,8 @@ class SpeculativeConfig: # draft related config as None here. self.draft_model_config = self.target_model_config self.draft_parallel_config = self.target_parallel_config + elif self.method == "suffix": + self._validate_suffix_decoding() else: self.prompt_lookup_max = 0 self.prompt_lookup_min = 0 @@ -430,6 +456,42 @@ class SpeculativeConfig: ) return self + def _validate_suffix_decoding(self): + if not has_arctic_inference(): + raise ImportError( + "Arctic Inference is required for suffix decoding. " + "Install via `pip install arctic-inference==0.1.0`." + ) + if self.num_speculative_tokens is None: + # Suffix decoding decides the actual number of speculative tokens + # dynamically and treats num_speculative_tokens as a maximum limit. 
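+            # With the defaults this falls back to suffix_decoding_max_tree_depth,
+            # i.e. at most 24 draft tokens may be proposed per step.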
+ self.num_speculative_tokens = self.suffix_decoding_max_tree_depth + logger.warning( + "Defaulted num_speculative_tokens to %s for suffix decoding.", + self.num_speculative_tokens, + ) + # Validate values + if self.suffix_decoding_max_tree_depth < 1: + raise ValueError( + f"suffix_decoding_max_tree_depth=" + f"{self.suffix_decoding_max_tree_depth} must be >= 1" + ) + if self.suffix_decoding_max_cached_requests < 0: + raise ValueError( + f"suffix_decoding_max_cached_requests=" + f"{self.suffix_decoding_max_cached_requests} must be >= 0" + ) + if self.suffix_decoding_max_spec_factor < 0: + raise ValueError( + f"suffix_decoding_max_spec_factor=" + f"{self.suffix_decoding_max_spec_factor} must be >= 0" + ) + if not 0 <= self.suffix_decoding_min_token_prob <= 1: + raise ValueError( + f"suffix_decoding_min_token_prob=" + f"{self.suffix_decoding_min_token_prob} must be in [0, 1]" + ) + @staticmethod def _maybe_override_draft_max_model_len( speculative_max_model_len: int | None, @@ -582,6 +644,6 @@ class SpeculativeConfig: def __repr__(self) -> str: method = self.method - model = None if method == "ngram" else self.draft_model_config.model + model = None if method in ("ngram", "suffix") else self.draft_model_config.model num_spec_tokens = self.num_speculative_tokens return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index 409a5a6cd302d..f01d2c7a6a33d 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -403,3 +403,9 @@ def has_triton_kernels() -> bool: def has_tilelang() -> bool: """Whether the optional `tilelang` package is available.""" return _has_module("tilelang") + + +def has_arctic_inference() -> bool: + """Whether the optional `arctic_inference` package is available.""" + + return _has_module("arctic_inference") diff --git a/vllm/v1/spec_decode/suffix_decoding.py b/vllm/v1/spec_decode/suffix_decoding.py new file mode 100644 index 0000000000000..049e335db3254 --- /dev/null +++ b/vllm/v1/spec_decode/suffix_decoding.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.config import VllmConfig +from vllm.v1.worker.gpu_input_batch import InputBatch + + +class SuffixDecodingProposer: + """ + Speculative decoding proposer for Suffix Decoding (https://arxiv.org/pdf/2411.04975). + This class imports and uses the official implementation from Arctic Inference + (https://github.com/snowflakedb/ArcticInference). + """ + + def __init__(self, vllm_config: VllmConfig): + config = vllm_config.speculative_config + self.num_speculative_tokens = config.num_speculative_tokens + self.max_tree_depth = config.suffix_decoding_max_tree_depth + self.max_spec_factor = config.suffix_decoding_max_spec_factor + self.min_token_prob = config.suffix_decoding_min_token_prob + self.max_model_len = vllm_config.model_config.max_model_len + + # Lazy import to avoid error when Suffix Decoding is not used. + from arctic_inference.suffix_decoding import SuffixDecodingCache + + # Initialize and empty cache. This object will take care of caching request + # outputs, evicting old requests, and manages the per-prompt suffix trees. 
+ self.suffix_cache = SuffixDecodingCache( + max_tree_depth=config.suffix_decoding_max_tree_depth, + max_cached_requests=config.suffix_decoding_max_cached_requests, + ) + + def propose( + self, + input_batch: InputBatch, + sampled_token_ids: list[list[int]], + ) -> list[list[int]]: + """ + Propose speculative tokens for each request in the input batch. Suffix Decoding + will speculate a dynamic number of tokens for each request every decoding step, + so each entry in the returned list may have different lengths. + """ + draft_token_ids: list[list[int]] = [] + for i, sampled_ids in enumerate(sampled_token_ids): + if not sampled_ids: + # Skip speculative decoding for partial prefills. + draft_token_ids.append([]) + continue + + # Skip requests that require sampling parameters that are not + # supported with speculative decoding. + req_id = input_batch.req_ids[i] + if req_id in input_batch.spec_decode_unsupported_reqs: + draft_token_ids.append([]) + continue + + num_tokens = input_batch.num_tokens_no_spec[i] + if num_tokens >= self.max_model_len: + # Skip requests that have already reached the max model length. + draft_token_ids.append([]) + continue + + index = input_batch.req_id_to_index[req_id] + if req_id not in self.suffix_cache.active_requests: + if req_id in self.suffix_cache.cached_requests: + # Reset the suffix cache for this request. + self.suffix_cache.evict_cached_response(req_id) + num_prompt_tokens = input_batch.num_prompt_tokens[index] + prompt_token_ids = input_batch.token_ids_cpu[index, :num_prompt_tokens] + # Start a new request, this will build the suffix tree for that prompt. + self.suffix_cache.start_request(req_id, prompt_token_ids) + + # Append the newly sampled ids to the suffix cache for this request. + self.suffix_cache.add_active_response(req_id, sampled_ids) + + # Suffix decoding only uses the most recent tokens up to max_tree_depth, so + # we extract the pattern from the end of the input. + start = max(0, num_tokens - self.max_tree_depth) + pattern = input_batch.token_ids_cpu[i, start:num_tokens] + draft = self.suffix_cache.speculate( + req_id, + pattern, + max_spec_tokens=min( + self.num_speculative_tokens, self.max_model_len - num_tokens - 1 + ), + max_spec_factor=self.max_spec_factor, + min_token_prob=self.min_token_prob, + ) + + draft_token_ids.append(draft.token_ids) + + # Stop requests that were not seen in the input batch. + for req_id in ( + self.suffix_cache.active_requests - input_batch.req_id_to_index.keys() + ): + self.suffix_cache.stop_request(req_id) + + return draft_token_ids + + def load_model(self, *args, **kwargs): + # No model to load. + pass diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9212221bb6009..e700c09038e28 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -125,6 +125,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext from vllm.v1.worker.dp_utils import coordinate_batch_across_dp @@ -336,16 +337,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # the last PP rank. This is not ideal if there are many # layers in the draft model. 
if self.speculative_config and get_pp_group().is_last_rank: + self.drafter: ( + NgramProposer | SuffixDecodingProposer | EagleProposer | MedusaProposer + ) if self.speculative_config.method == "ngram": self.drafter = NgramProposer(self.vllm_config) + elif self.speculative_config.method == "suffix": + self.drafter = SuffixDecodingProposer(self.vllm_config) elif self.speculative_config.use_eagle(): - self.drafter = EagleProposer(self.vllm_config, self.device, self) # type: ignore + self.drafter = EagleProposer(self.vllm_config, self.device, self) if self.speculative_config.method == "eagle3": self.use_aux_hidden_state_outputs = True elif self.speculative_config.method == "medusa": self.drafter = MedusaProposer( vllm_config=self.vllm_config, device=self.device - ) # type: ignore + ) else: raise ValueError( "Unknown speculative decoding method: " @@ -2783,6 +2789,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.input_batch.token_ids_cpu, self.input_batch.spec_decode_unsupported_reqs, ) + elif self.speculative_config.method == "suffix": + assert isinstance(sampled_token_ids, list) + assert isinstance(self.drafter, SuffixDecodingProposer) + draft_token_ids = self.drafter.propose(self.input_batch, sampled_token_ids) elif self.speculative_config.method == "medusa": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) From a4398fbb5e9fe20c8f0f092da4de30c9a582cce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophie=20du=20Cou=C3=A9dic?= Date: Mon, 3 Nov 2025 19:33:17 +0100 Subject: [PATCH 097/976] [Feature][Benchmarks] Support `inf` burstiness (#26941) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sophie du Couédic --- vllm/benchmarks/serve.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 4b15d8e62913c..b8f44966db7a0 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -189,9 +189,16 @@ async def get_request( total_requests, request_rate, ) + assert current_request_rate > 0.0, ( + f"Obtained non-positive request rate {current_request_rate}." 
+ ) request_rates.append(current_request_rate) if current_request_rate == float("inf"): delay_ts.append(0) + elif burstiness == float("inf"): + # when burstiness tends to infinity, the delay time becomes constant + # and tends to the inverse of the request rate + delay_ts.append(1.0 / current_request_rate) else: theta = 1.0 / (current_request_rate * burstiness) From 55011aef24c2838b05df585822b8fc231eea19b2 Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Mon, 3 Nov 2025 11:12:15 -0800 Subject: [PATCH 098/976] [Bugfix][Qwen][Multimodal] Move Qwen2_5_vl sdpa to custom op and reenable compile (#27764) Signed-off-by: Lucas Kabela --- vllm/attention/ops/vit_attn_wrappers.py | 53 ++++++++++++++++++++++++ vllm/model_executor/models/qwen2_5_vl.py | 44 +++++++------------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index 6cefe74416685..06a9f7cd82266 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -14,6 +14,7 @@ To use these ops, you must have a recent version of PyTorch installed (>= 2.4.0) import einops import torch +import torch.nn.functional as F from vllm.utils.torch_utils import direct_register_custom_op @@ -123,3 +124,55 @@ def vit_flash_attn_wrapper( return torch.ops.vllm.flash_attn_maxseqlen_wrapper( q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa ) + + +# TODO: Once we have a torch 2.10, we can use tensor slices +# so we won't need to wrap this in custom ops +def torch_sdpa_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, +) -> torch.Tensor: + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = ( + einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] + ) + output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) + output_i = einops.rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous() + return context_layer + + +def torch_sdpa_wrapper_fake( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, +) -> torch.Tensor: + b, s, h, d = q.shape + return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) + + +direct_register_custom_op( + op_name="torch_sdpa_wrapper", + op_func=torch_sdpa_wrapper, + fake_impl=torch_sdpa_wrapper_fake, +) + + +def vit_torch_sdpa_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + cu_seqlens: torch.Tensor, +) -> torch.Tensor: + return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, cu_seqlens) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 3585783e4ccc3..2b04608dfd03f 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -46,6 +46,7 @@ from vllm.attention.backends.registry import _Backend from vllm.attention.layer import maybe_get_vit_flash_attn_backend from vllm.attention.ops.vit_attn_wrappers import ( vit_flash_attn_wrapper, + vit_torch_sdpa_wrapper, vit_xformers_attn_wrapper, ) from vllm.compilation.decorators import support_torch_compile @@ -442,23 +443,12 @@ class Qwen2_5_VisionAttention(nn.Module): q = q.contiguous() k = k.contiguous() v = 
v.contiguous() - outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] - q_i, k_i, v_i = ( - einops.rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] - ) - output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) - output_i = einops.rearrange(output_i, "b h s d -> b s h d ") - outputs.append(output_i) - context_layer = torch.cat(outputs, dim=1) - context_layer = einops.rearrange( - context_layer, "b s h d -> s b (h d)" - ).contiguous() + context_layer = vit_torch_sdpa_wrapper( + q, + k, + v, + cu_seqlens, + ) elif self.attn_backend == _Backend.XFORMERS: context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) @@ -466,17 +456,15 @@ class Qwen2_5_VisionAttention(nn.Module): return output -# (FIXME): Enable this after dynamic slicing is fixed -# See https://github.com/vllm-project/vllm/pull/27760 -# @support_torch_compile( -# dynamic_arg_dims={ -# "x": 0, -# "cu_seqlens": 0, -# "rotary_pos_emb": 0, -# "seqlens": 0, -# }, -# mark_unbacked_dims={"seqlens": 0}, -# ) +@support_torch_compile( + dynamic_arg_dims={ + "x": 0, + "cu_seqlens": 0, + "rotary_pos_emb": 0, + "seqlens": 0, + }, + mark_unbacked_dims={"seqlens": 0}, +) class Qwen2_5_VisionBlock(nn.Module): def __init__( self, From 145c00a4d32b7a681f7fb936c9575812c7aa7880 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 3 Nov 2025 15:17:10 -0500 Subject: [PATCH 099/976] [Bugfix] change FlashMLA reorder_batch_threshold (#27777) Signed-off-by: Matthew Bonanni --- vllm/v1/attention/backends/mla/flashmla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 1f98204031ed5..bc17307532093 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -71,7 +71,7 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM - reorder_batch_threshold: int = 512 # process small prefills with decode pathway + reorder_batch_threshold: int = 128 # process small prefills with decode pathway # ^ TODO(matt): tune this def __init__( From 786030721efb2b85a582d65f9bb5d7197de06f83 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 4 Nov 2025 04:35:16 +0800 Subject: [PATCH 100/976] [Docs] add runai_streamer_sharded to LoadConfig (#27937) Signed-off-by: Andy Xie --- vllm/config/load.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/config/load.py b/vllm/config/load.py index d625c1ac987e7..e424f8c5edb62 100644 --- a/vllm/config/load.py +++ b/vllm/config/load.py @@ -40,6 +40,8 @@ class LoadConfig: more information.\n - "runai_streamer" will load the Safetensors weights using Run:ai Model Streamer.\n + - "runai_streamer_sharded" will load weights from pre-sharded checkpoint + files using Run:ai Model Streamer.\n - "bitsandbytes" will load the weights using bitsandbytes quantization.\n - "sharded_state" will load weights from pre-sharded checkpoint files, supporting efficient loading of tensor-parallel models.\n From 01baefe674e61d156672d14b11b20055252df662 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 3 Nov 2025 16:04:40 -0500 Subject: [PATCH 101/976] Add TP parameter to 
attention tests (#27683) Signed-off-by: Matthew Bonanni --- .buildkite/test-pipeline.yaml | 3 +- tests/v1/attention/test_attention_backends.py | 58 +++++++++++++++++-- tests/v1/attention/test_mla_backends.py | 31 +++++++++- .../v1/attention/test_sparse_mla_backends.py | 11 +++- 4 files changed, 92 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 07e2bf09d8aa0..4a898df8f2a34 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -347,8 +347,7 @@ steps: - vllm/v1/attention - tests/v1/attention commands: - - export VLLM_DISABLE_FLASHINFER_PREFILL=1 # TODO: FI prefill is bugged and causes incorrectness, fix this - - pytest -v -s v1/attention + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this - label: V1 Test others (CPU) # 5 mins source_file_dependencies: diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 6659b3eb1e98f..08aeb6f298f61 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -295,6 +295,7 @@ def _test_backend_correctness( block_size: int = 16, atol: float = 1e-2, rtol: float = 1e-2, + tensor_parallel_size: int = 1, ): """ Test that all backends produce similar outputs to a reference implementation @@ -310,13 +311,38 @@ def _test_backend_correctness( 4. Running each vLLM attention backend with the new queries and the simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. + + Note: When tensor_parallel_size > 1, we simulate the head partitioning + by overriding the model config to use fewer heads, without requiring + multiple GPUs. This tests that backends work correctly with different + head counts. 
""" current_platform.seed_everything(42) + + hf_config_override = None + if tensor_parallel_size > 1: + from vllm.config import ModelConfig + + temp_config = ModelConfig(model=model, max_model_len=1) + original_num_heads = temp_config.hf_text_config.num_attention_heads + original_num_kv_heads = getattr( + temp_config.hf_text_config, "num_key_value_heads", None + ) + hf_config_override = { + "num_attention_heads": original_num_heads // tensor_parallel_size, + } + if original_num_kv_heads is not None: + hf_config_override["num_key_value_heads"] = max( + 1, original_num_kv_heads // tensor_parallel_size + ) + vllm_config = create_vllm_config( model_name=model, + tensor_parallel_size=1, # Always use TP=1 to avoid multi-GPU requirements max_model_len=max(batch_spec.seq_lens), block_size=block_size, num_gpu_blocks=8192, + hf_config_override=hf_config_override, ) device = torch.device("cuda:0") @@ -503,7 +529,10 @@ def _test_backend_correctness( ], ) @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"]) -def test_causal_backend_correctness(batch_spec_name: str, model: str): +@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4]) +def test_causal_backend_correctness( + batch_spec_name: str, model: str, tensor_parallel_size: int +): """Test backend's correctness with causal attention.""" def causal_mask_mod( @@ -523,12 +552,23 @@ def test_causal_backend_correctness(batch_spec_name: str, model: str): SMALL_BLOCK_BACKENDS = [ x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS ] - _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, causal_mask_mod) + _test_backend_correctness( + batch_spec, + model, + SMALL_BLOCK_BACKENDS, + causal_mask_mod, + tensor_parallel_size=tensor_parallel_size, + ) # Fast FlexAttention needs to run with block_size=128 if LARGE_BLOCK_BACKENDS: _test_backend_correctness( - batch_spec, model, LARGE_BLOCK_BACKENDS, causal_mask_mod, block_size=128 + batch_spec, + model, + LARGE_BLOCK_BACKENDS, + causal_mask_mod, + block_size=128, + tensor_parallel_size=tensor_parallel_size, ) @@ -545,7 +585,10 @@ SLIDING_WINDOW_BACKENDS_TO_TEST = [ ["small_decode", "small_prefill", "mixed_medium", "large_decode", "large_prefill"], ) @pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"]) -def test_sliding_window_backend_correctness(batch_spec_name: str, model: str): +@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4]) +def test_sliding_window_backend_correctness( + batch_spec_name: str, model: str, tensor_parallel_size: int +): """Test backend's correctness with sliding window attention.""" def sliding_window_mask_mod( @@ -575,7 +618,11 @@ def test_sliding_window_backend_correctness(batch_spec_name: str, model: str): x for x in SLIDING_WINDOW_BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS ] _test_backend_correctness( - batch_spec, model, SMALL_BLOCK_BACKENDS, sliding_window_mask_mod_fn + batch_spec, + model, + SMALL_BLOCK_BACKENDS, + sliding_window_mask_mod_fn, + tensor_parallel_size=tensor_parallel_size, ) # Fast FlexAttention needs to run with block_size=128 @@ -586,4 +633,5 @@ def test_sliding_window_backend_correctness(batch_spec_name: str, model: str): LARGE_BLOCK_BACKENDS, sliding_window_mask_mod_fn, block_size=128, + tensor_parallel_size=tensor_parallel_size, ) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index cda4fb11c096e..5679fafe63ee8 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -394,8 +394,11 @@ def 
run_attention_backend( "spec_decode_medium", ], ) -@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) -def test_backend_correctness(dist_init, batch_spec_name: str, model: str): +@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"]) +@pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16]) +def test_backend_correctness( + dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int +): """ Test that all backends produce similar outputs to a reference implementation using torch.nn.functional.scaled_dot_product_attention. @@ -410,6 +413,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): 4. Running each vLLM attention backend with the new queries and the simulated paged KV cache. 5. Comparing the vLLM backend's output to the ground-truth SDPA output. + + Note: When tensor_parallel_size > 1, we simulate the head partitioning + by overriding the model config to use fewer heads, without requiring + multiple GPUs. This tests that backends work correctly with different + head counts. """ batch_spec = BATCH_SPECS[batch_spec_name] @@ -423,11 +431,30 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): # Add 1 for null block at index 0, and some buffer num_gpu_blocks = required_blocks + 1 + 100 + hf_config_override = None + if tensor_parallel_size > 1: + from vllm.config import ModelConfig + + temp_config = ModelConfig(model=model, max_model_len=1) + original_num_heads = temp_config.hf_text_config.num_attention_heads + original_num_kv_heads = getattr( + temp_config.hf_text_config, "num_key_value_heads", None + ) + hf_config_override = { + "num_attention_heads": original_num_heads // tensor_parallel_size, + } + if original_num_kv_heads is not None: + hf_config_override["num_key_value_heads"] = max( + 1, original_num_kv_heads // tensor_parallel_size + ) + vllm_config = create_vllm_config( model_name=model, + tensor_parallel_size=1, # Always use TP=1 to avoid multi-GPU requirements max_model_len=max(batch_spec.seq_lens), num_gpu_blocks=num_gpu_blocks, block_size=default_block_size, + hf_config_override=hf_config_override, ) # For spec decode tests, add a speculative_config to set the reorder_batch_threshold diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index 02324d2aca6ef..b34d587eb362d 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -113,7 +113,10 @@ def _quantize_dequantize_fp8_ds_mla( @pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys())) @pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"]) -def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype): +@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4]) +def test_sparse_backend_decode_correctness( + dist_init, batch_name, kv_cache_dtype, tensor_parallel_size +): if not torch.cuda.is_available(): pytest.skip("CUDA is required for sparse MLA decode test") @@ -135,8 +138,11 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype total_cache_tokens = sum(batch_spec.seq_lens) block_size = 64 + # Note: We use TP=1 to avoid multi-GPU requirements in CI. + # The test simulates head partitioning via mocked methods below. 
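+    # (get_num_attention_heads is patched below to return
+    # max(1, num_heads // tensor_parallel_size), i.e. the per-rank head count.)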
vllm_config = create_vllm_config( model_name="deepseek-ai/DeepSeek-V2-Lite-Chat", + tensor_parallel_size=1, max_model_len=max_seqlen, num_gpu_blocks=max(2048, cdiv(total_cache_tokens, block_size) + 1), block_size=block_size, @@ -156,7 +162,8 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype ) model_config.dtype = dtype model_config.get_num_attention_heads = MethodType( - lambda self, parallel_config: num_heads, model_config + lambda self, parallel_config: max(1, num_heads // tensor_parallel_size), + model_config, ) model_config.get_num_kv_heads = MethodType( lambda self, parallel_config: 1, model_config From ccd3e55e51d44bf3a17b2203a304c9609aa5dfe2 Mon Sep 17 00:00:00 2001 From: Hank_ <37239608+ILikeIneine@users.noreply.github.com> Date: Tue, 4 Nov 2025 05:27:03 +0800 Subject: [PATCH 102/976] [Bugfix][plugin] fla crash on plugin (#27322) --- vllm/model_executor/layers/fla/ops/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py index 3a503981a8734..5a48e56a5fbbf 100644 --- a/vllm/model_executor/layers/fla/ops/utils.py +++ b/vllm/model_executor/layers/fla/ops/utils.py @@ -17,6 +17,7 @@ from typing import Any, Literal import torch +from vllm.platforms import current_platform from vllm.triton_utils import triton logger = logging.getLogger(__name__) @@ -137,8 +138,8 @@ def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]: # For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'. # However, the torch backend is 'cuda' for both Nvidia and AMD GPUs. # Therefore, we need to check the triton backend to determine the actual GPU vendor. -device = get_available_device() if get_available_device() != "hip" else "cuda" -device_torch_lib = getattr(torch, device) +device = "cuda" if current_platform.is_cuda_alike() else get_available_device() +device_torch_lib = getattr(torch, device, None) device_platform = _check_platform() is_amd = device_platform == "amd" From 3758757377b713b6acc997d0ac2c5dd49c332278 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 3 Nov 2025 17:26:49 -0500 Subject: [PATCH 103/976] [Bugfix] Fix MoE Routing Simulation (#28002) Signed-off-by: Tyler Michael Smith --- vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../layers/fused_moe/routing_simulator.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 46d351b48c5e8..55aa2593193ab 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -2066,7 +2066,7 @@ class FusedMoE(CustomOp): ) # DeepSeekv2 uses grouped_top_k - if use_grouped_topk: + elif use_grouped_topk: assert topk_group is not None assert num_expert_group is not None if is_rocm_aiter_moe_enabled(): diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py index 8b04cf4539e04..a01cdc4908b93 100644 --- a/vllm/model_executor/layers/fused_moe/routing_simulator.py +++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py @@ -14,6 +14,10 @@ from typing import Any import torch +from vllm.logger import init_logger + +logger = init_logger(__name__) + class RoutingStrategy(ABC): """Base class for token-to-expert routing strategies.""" @@ -290,6 +294,12 @@ class RoutingSimulator: f"Available strategies: " 
f"{list(RoutingSimulator._routing_strategies.keys())}" ) + logger.warning_once( + "Simulating MoE routing using a %s strategy. " + "This should only be used for performance testing. " + "Model outputs will not be valid.", + strategy_name, + ) strategy = RoutingSimulator._routing_strategies[strategy_name] return strategy.route_tokens( From 7956b0c0bca8c2b778e6a0b18953b5a08e136c90 Mon Sep 17 00:00:00 2001 From: QiliangCui Date: Mon, 3 Nov 2025 16:35:54 -0800 Subject: [PATCH 104/976] Remove the tpu docker image nightly build. (#27997) Signed-off-by: Qiliang Cui --- .buildkite/release-pipeline.yaml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 33b7114666fa2..12f730738b8a5 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -116,24 +116,6 @@ steps: commands: - "bash .buildkite/scripts/annotate-release.sh" - - label: "Build and publish TPU release image" - depends_on: ~ - if: build.env("NIGHTLY") == "1" - agents: - queue: tpu_queue_postmerge - commands: - - "yes | docker system prune -a" - - "git fetch --all" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - - "docker push vllm/vllm-tpu:nightly" - - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" - plugins: - - docker-login#v3.0.0: - username: vllmbot - password-env: DOCKERHUB_TOKEN - env: - DOCKER_BUILDKIT: "1" - - input: "Provide Release version here" id: input-release-version fields: From b13a44754674a0056d7c8113deb33ea858f6ef1c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 4 Nov 2025 09:12:19 +0800 Subject: [PATCH 105/976] [Bugfix][ROCm] Fix ViT rotary embeddings for torch.compile compatibility on ROCm (#27748) Signed-off-by: vllmellm --- vllm/model_executor/layers/rotary_embedding/common.py | 11 +++++++---- vllm/model_executor/models/glm4_1v.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 9e6ec9fdd523c..196533b617959 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -77,7 +77,11 @@ def dispatch_rotary_emb_function( if current_platform.is_cuda(): return apply_rotary_emb - if current_platform.is_rocm(): + # if torch compile is not enabled + # use rotary embedding function from flash_attn package + # otherwise use the naive pytorch embedding implementation + # is faster when torch compile is enabled. + if current_platform.is_rocm() and not torch.compiler.is_compiling(): if find_spec("flash_attn") is not None: from flash_attn.ops.triton.rotary import apply_rotary @@ -87,11 +91,10 @@ def dispatch_rotary_emb_function( "flash_attn is not installed. Falling back to PyTorch " "implementation for rotary embeddings." 
) - if default is not None: return default - else: - return apply_rotary_emb_torch + + return apply_rotary_emb_torch # yarn functions diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 3e243385fd049..121e84469c52f 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -370,7 +370,7 @@ class Glm4vVisionAttention(nn.Module): cu_seqlens_k=cu_seqlens, max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen, - dropout_p=0, + dropout_p=0.0, causal=False, ) From 6ddae74054d4d9b55b367bfc9db82969f9d81930 Mon Sep 17 00:00:00 2001 From: li2haipeng <44383182+li2haipeng@users.noreply.github.com> Date: Mon, 3 Nov 2025 17:30:20 -0800 Subject: [PATCH 106/976] [LoRA] Lora shrink swizzle (#27694) Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com> Signed-off-by: Haipeng Li Co-authored-by: Jee Jee Li --- vllm/lora/ops/triton_ops/lora_shrink_op.py | 15 +++++++++++++-- vllm/lora/ops/triton_ops/utils.py | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py index 8d126197f83ea..adc5c9dce5e84 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink_op.py +++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py @@ -41,6 +41,7 @@ def _lora_shrink_kernel( BLOCK_K: tl.constexpr, EVEN_K: tl.constexpr, SPLIT_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, SLICE_NUM: tl.constexpr, ): cta_n_num = tl.cdiv(N, BLOCK_N) @@ -48,8 +49,16 @@ def _lora_shrink_kernel( pid_sk_m_n = tl.program_id(axis=0) pid_sk = pid_sk_m_n % SPLIT_K - pid_m = (pid_sk_m_n // SPLIT_K) % cta_m_num - pid_n = pid_sk_m_n // (SPLIT_K * cta_m_num) % cta_n_num + + pid_m_n = pid_sk_m_n // SPLIT_K + num_pid_in_group = GROUP_SIZE_M * cta_n_num + group_id = pid_m_n // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(cta_m_num - first_pid_m, GROUP_SIZE_M) + + # Column-major ordering within groups for better cache reuse + pid_m = first_pid_m + ((pid_m_n % num_pid_in_group) % group_size_m) + pid_n = (pid_m_n % num_pid_in_group) // group_size_m slice_id = tl.program_id(axis=1) lora_idx = tl.program_id(axis=2) @@ -194,6 +203,7 @@ def _lora_shrink( NUM_WARPS = kernel_config["num_warps"] NUM_STAGES = kernel_config["num_stages"] NUM_CTAS = kernel_config["num_ctas"] + GROUP_SIZE_M = kernel_config.get("group_size_m", 8) EVEN_K = K % (BLOCK_K * SPLIT_K) == 0 # type: ignore # TODO (varun): This grid formulation maximizes parallelization at the @@ -233,6 +243,7 @@ def _lora_shrink( BLOCK_K, EVEN_K, SPLIT_K, + GROUP_SIZE_M, NUM_SLICES, num_warps=NUM_WARPS, num_ctas=NUM_CTAS, diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 9ffb6dc3d85e5..368c5037d2e4d 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -199,6 +199,7 @@ def get_lora_op_configs( "split_k": 64 if batch < 128 else 8, "num_warps": 4, "num_ctas": 1, + "group_size_m": 8, "num_stages": 2, "max_nreg": None, } From c02fccdbd2794fe016ebd738e3a8b8c8d78eb78c Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 4 Nov 2025 10:10:10 +0800 Subject: [PATCH 107/976] [Refactor] Lazy import tool_parser (#27974) Signed-off-by: chaunceyjiang --- docs/features/tool_calling.md | 7 +- .../tool_use/test_deepseekv31_tool_parser.py | 4 +- .../tool_use/test_ernie45_moe_tool_parser.py | 2 +- tests/tool_use/test_glm4_moe_tool_parser.py | 4 +- tests/tool_use/test_jamba_tool_parser.py | 2 +- tests/tool_use/test_kimi_k2_tool_parser.py | 2 +- 
tests/tool_use/test_minimax_tool_parser.py | 2 +- tests/tool_use/test_openai_tool_parser.py | 2 +- tests/tool_use/test_seed_oss_tool_parser.py | 2 +- tests/tool_use/test_xlam_tool_parser.py | 2 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/cli_args.py | 2 +- .../openai/tool_parsers/__init__.py | 195 +++++++++++++----- .../tool_parsers/abstract_tool_parser.py | 142 +++++++++---- .../tool_parsers/deepseekv31_tool_parser.py | 2 - .../tool_parsers/deepseekv3_tool_parser.py | 2 - .../tool_parsers/ernie45_tool_parser.py | 2 - .../tool_parsers/glm4_moe_tool_parser.py | 2 - .../granite_20b_fc_tool_parser.py | 2 - .../tool_parsers/granite_tool_parser.py | 2 - .../openai/tool_parsers/hermes_tool_parser.py | 2 - .../tool_parsers/hunyuan_a13b_tool_parser.py | 2 - .../tool_parsers/internlm2_tool_parser.py | 2 - .../openai/tool_parsers/jamba_tool_parser.py | 3 +- .../tool_parsers/kimi_k2_tool_parser.py | 2 - .../llama4_pythonic_tool_parser.py | 2 - .../openai/tool_parsers/llama_tool_parser.py | 3 - .../tool_parsers/longcat_tool_parser.py | 2 - .../tool_parsers/minimax_m2_tool_parser.py | 2 - .../tool_parsers/minimax_tool_parser.py | 2 - .../tool_parsers/mistral_tool_parser.py | 2 - .../openai/tool_parsers/olmo3_tool_parser.py | 2 - .../openai/tool_parsers/openai_tool_parser.py | 2 - .../tool_parsers/phi4mini_tool_parser.py | 2 - .../tool_parsers/pythonic_tool_parser.py | 2 - .../tool_parsers/qwen3coder_tool_parser.py | 2 - .../tool_parsers/qwen3xml_tool_parser.py | 2 - .../tool_parsers/seed_oss_tool_parser.py | 2 - .../openai/tool_parsers/step3_tool_parser.py | 2 - .../openai/tool_parsers/xlam_tool_parser.py | 2 - 40 files changed, 266 insertions(+), 158 deletions(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 7a1b30096a56d..7e6c69e717dba 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -407,7 +407,6 @@ Here is a summary of a plugin file: # the name list in register_module can be used # in --tool-call-parser. you can define as many # tool parsers as you want here. 
- @ToolParserManager.register_module(["example"]) class ExampleToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) @@ -439,6 +438,12 @@ Here is a summary of a plugin file: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=text) + # register the tool parser to ToolParserManager + ToolParserManager.register_lazy_module( + name="example", + module_path="vllm.entrypoints.openai.tool_parsers.example", + class_name="ExampleToolParser", + ) ``` diff --git a/tests/tool_use/test_deepseekv31_tool_parser.py b/tests/tool_use/test_deepseekv31_tool_parser.py index 9b7e71b49c05b..db5168071fbce 100644 --- a/tests/tool_use/test_deepseekv31_tool_parser.py +++ b/tests/tool_use/test_deepseekv31_tool_parser.py @@ -3,7 +3,9 @@ import pytest -from vllm.entrypoints.openai.tool_parsers import DeepSeekV31ToolParser +from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import ( + DeepSeekV31ToolParser, +) from vllm.transformers_utils.tokenizer import get_tokenizer MODEL = "deepseek-ai/DeepSeek-V3.1" diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py index 0862d14812d72..fb5af6e13a96b 100644 --- a/tests/tool_use/test_ernie45_moe_tool_parser.py +++ b/tests/tool_use/test_ernie45_moe_tool_parser.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import ( FunctionCall, ToolCall, ) -from vllm.entrypoints.openai.tool_parsers import Ernie45ToolParser +from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index 6f1f6671d9b3c..f545f52c02dcb 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -7,7 +7,9 @@ import json import pytest from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall -from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser +from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import ( + Glm4MoeModelToolParser, +) from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index 6dcdd5ba2ce76..9eb73b80fa9b4 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -9,7 +9,7 @@ import pytest from partial_json_parser.core.options import Allow from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall -from vllm.entrypoints.openai.tool_parsers import JambaToolParser +from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py index 43b8c70acbfc3..c358589dbc292 100644 --- a/tests/tool_use/test_kimi_k2_tool_parser.py +++ b/tests/tool_use/test_kimi_k2_tool_parser.py @@ -7,7 +7,7 @@ import json import pytest from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall -from vllm.entrypoints.openai.tool_parsers import KimiK2ToolParser +from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import 
KimiK2ToolParser from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py index 8610656fa288d..4332984083dab 100644 --- a/tests/tool_use/test_minimax_tool_parser.py +++ b/tests/tool_use/test_minimax_tool_parser.py @@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import ( FunctionCall, ToolCall, ) -from vllm.entrypoints.openai.tool_parsers import MinimaxToolParser +from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py index f6223f3fdce4f..c874a9601ae70 100644 --- a/tests/tool_use/test_openai_tool_parser.py +++ b/tests/tool_use/test_openai_tool_parser.py @@ -15,7 +15,7 @@ from openai_harmony import ( ) from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall -from vllm.entrypoints.openai.tool_parsers import OpenAIToolParser +from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser from vllm.transformers_utils.tokenizer import get_tokenizer MODEL = "gpt2" diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index 1133b949f2270..1367ad87cb019 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -14,7 +14,7 @@ from vllm.entrypoints.openai.protocol import ( FunctionCall, ToolCall, ) -from vllm.entrypoints.openai.tool_parsers import SeedOssToolParser +from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py index 8c27b2911f8f9..122b427d60409 100644 --- a/tests/tool_use/test_xlam_tool_parser.py +++ b/tests/tool_use/test_xlam_tool_parser.py @@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import ( FunctionCall, ToolCall, ) -from vllm.entrypoints.openai.tool_parsers import xLAMToolParser +from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser from vllm.transformers_utils.detokenizer_utils import detokenize_incrementally from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c37aba2776aeb..e184f22f36307 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1943,7 +1943,7 @@ def create_server_unix_socket(path: str) -> socket.socket: def validate_api_server_args(args): - valid_tool_parses = ToolParserManager.tool_parsers.keys() + valid_tool_parses = ToolParserManager.list_registered() if args.enable_auto_tool_choice and args.tool_call_parser not in valid_tool_parses: raise KeyError( f"invalid tool call parser: {args.tool_call_parser} " diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 1a775d3d68094..476587c178237 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -219,7 +219,7 @@ class FrontendArgs: frontend_kwargs["middleware"]["default"] = [] # Special case: Tool call parser shows built-in options. 
- valid_tool_parsers = list(ToolParserManager.tool_parsers.keys()) + valid_tool_parsers = list(ToolParserManager.list_registered()) parsers_str = ",".join(valid_tool_parsers) frontend_kwargs["tool_call_parser"]["metavar"] = ( f"{{{parsers_str}}} or name registered in --tool-parser-plugin" diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 4541ca50822f7..7038d4c1f05cc 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,61 +1,142 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .abstract_tool_parser import ToolParser, ToolParserManager -from .deepseekv3_tool_parser import DeepSeekV3ToolParser -from .deepseekv31_tool_parser import DeepSeekV31ToolParser -from .ernie45_tool_parser import Ernie45ToolParser -from .glm4_moe_tool_parser import Glm4MoeModelToolParser -from .granite_20b_fc_tool_parser import Granite20bFCToolParser -from .granite_tool_parser import GraniteToolParser -from .hermes_tool_parser import Hermes2ProToolParser -from .hunyuan_a13b_tool_parser import HunyuanA13BToolParser -from .internlm2_tool_parser import Internlm2ToolParser -from .jamba_tool_parser import JambaToolParser -from .kimi_k2_tool_parser import KimiK2ToolParser -from .llama4_pythonic_tool_parser import Llama4PythonicToolParser -from .llama_tool_parser import Llama3JsonToolParser -from .longcat_tool_parser import LongcatFlashToolParser -from .minimax_m2_tool_parser import MinimaxM2ToolParser -from .minimax_tool_parser import MinimaxToolParser -from .mistral_tool_parser import MistralToolParser -from .olmo3_tool_parser import Olmo3PythonicToolParser -from .openai_tool_parser import OpenAIToolParser -from .phi4mini_tool_parser import Phi4MiniJsonToolParser -from .pythonic_tool_parser import PythonicToolParser -from .qwen3coder_tool_parser import Qwen3CoderToolParser -from .qwen3xml_tool_parser import Qwen3XMLToolParser -from .seed_oss_tool_parser import SeedOssToolParser -from .step3_tool_parser import Step3ToolParser -from .xlam_tool_parser import xLAMToolParser +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, + ToolParserManager, +) -__all__ = [ - "ToolParser", - "ToolParserManager", - "Granite20bFCToolParser", - "GraniteToolParser", - "Hermes2ProToolParser", - "MistralToolParser", - "Internlm2ToolParser", - "Llama3JsonToolParser", - "JambaToolParser", - "Llama4PythonicToolParser", - "LongcatFlashToolParser", - "PythonicToolParser", - "Phi4MiniJsonToolParser", - "DeepSeekV3ToolParser", - "DeepSeekV31ToolParser", - "Ernie45ToolParser", - "xLAMToolParser", - "Olmo3PythonicToolParser", - "MinimaxToolParser", - "KimiK2ToolParser", - "HunyuanA13BToolParser", - "Glm4MoeModelToolParser", - "Qwen3CoderToolParser", - "Qwen3XMLToolParser", - "SeedOssToolParser", - "Step3ToolParser", - "OpenAIToolParser", - "MinimaxM2ToolParser", -] +__all__ = ["ToolParser", "ToolParserManager"] + + +""" +Register a lazy module mapping. 
+ +Example: + ToolParserManager.register_lazy_module( + name="kimi_k2", + module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser", + class_name="KimiK2ToolParser", + ) +""" + + +_TOOL_PARSERS_TO_REGISTER = { + "deepseek_v3": ( # name + "deepseekv3_tool_parser", # filename + "DeepSeekV3ToolParser", # class_name + ), + "deepseek_v31": ( + "deepseekv31_tool_parser", + "DeepSeekV31ToolParser", + ), + "ernie45": ( + "ernie45_tool_parser", + "Ernie45ToolParser", + ), + "glm45": ( + "glm4_moe_tool_parser", + "Glm4MoeModelToolParser", + ), + "granite-20b-fc": ( + "granite_20b_fc_tool_parser", + "Granite20bFCToolParser", + ), + "granite": ( + "granite_tool_parser", + "GraniteToolParser", + ), + "hermes": ( + "hermes_tool_parser", + "Hermes2ProToolParser", + ), + "hunyuan_a13b": ( + "hunyuan_a13b_tool_parser", + "HunyuanA13BToolParser", + ), + "internlm": ( + "internlm2_tool_parser", + "Internlm2ToolParser", + ), + "jamba": ( + "jamba_tool_parser", + "JambaToolParser", + ), + "kimi_k2": ( + "kimi_k2_tool_parser", + "KimiK2ToolParser", + ), + "llama3_json": ( + "llama_tool_parser", + "Llama3JsonToolParser", + ), + "llama4_json": ( + "llama_tool_parser", + "Llama4JsonToolParser", + ), + "llama4_pythonic": ( + "llama4_pythonic_tool_parser", + "Llama4PythonicToolParser", + ), + "longcat": ( + "longcat_tool_parser", + "LongcatFlashToolParser", + ), + "minimax_m2": ( + "minimax_m2_tool_parser", + "MinimaxM2ToolParser", + ), + "minimax": ( + "minimax_tool_parser", + "MinimaxToolParser", + ), + "mistral": ( + "mistral_tool_parser", + "MistralToolParser", + ), + "olmo3": ( + "olmo3_tool_parser", + "Olmo3PythonicToolParser", + ), + "openai": ( + "openai_tool_parser", + "OpenAIToolParser", + ), + "phi4_mini_json": ( + "phi4mini_tool_parser", + "Phi4MiniJsonToolParser", + ), + "pythonic": ( + "pythonic_tool_parser", + "PythonicToolParser", + ), + "qwen3_coder": ( + "qwen3coder_tool_parser", + "Qwen3CoderToolParser", + ), + "qwen3_xml": ( + "qwen3xml_tool_parser", + "Qwen3XMLToolParser", + ), + "seed_oss": ( + "seed_oss_tool_parser", + "SeedOssToolParser", + ), + "step3": ( + "step3_tool_parser", + "Step3ToolParser", + ), + "xlam": ( + "xlam_tool_parser", + "xLAMToolParser", + ), +} + + +def register_lazy_tool_parsers(): + for name, (file_name, class_name) in _TOOL_PARSERS_TO_REGISTER.items(): + module_path = f"vllm.entrypoints.openai.tool_parsers.{file_name}" + ToolParserManager.register_lazy_module(name, module_path, class_name) + + +register_lazy_tool_parsers() diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 212326fdafb1e..8d520f5bf8ef6 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib import os from collections.abc import Callable, Sequence from functools import cached_property @@ -99,89 +100,158 @@ class ToolParser: class ToolParserManager: - tool_parsers: dict[str, type] = {} + """ + Central registry for ToolParser implementations.
+ + Supports two modes: + - Eager (immediate) registration via `register_module` + - Lazy registration via `register_lazy_module` + """ + + tool_parsers: dict[str, type[ToolParser]] = {} + lazy_parsers: dict[str, tuple[str, str]] = {} # name -> (module_path, class_name) @classmethod - def get_tool_parser(cls, name) -> type: + def get_tool_parser(cls, name: str) -> type[ToolParser]: """ - Get tool parser by name which is registered by `register_module`. + Retrieve a registered or lazily registered ToolParser class. - Raise a KeyError exception if the name is not registered. + If the parser is lazily registered, + it will be imported and cached on first access. + Raises KeyError if not found. """ if name in cls.tool_parsers: return cls.tool_parsers[name] - raise KeyError(f"tool helper: '{name}' not found in tool_parsers") + if name in cls.lazy_parsers: + return cls._load_lazy_parser(name) + + raise KeyError(f"Tool parser '{name}' not found.") + + @classmethod + def _load_lazy_parser(cls, name: str) -> type[ToolParser]: + """Import and register a lazily loaded parser.""" + module_path, class_name = cls.lazy_parsers[name] + try: + mod = importlib.import_module(module_path) + parser_cls = getattr(mod, class_name) + if not issubclass(parser_cls, ToolParser): + raise TypeError( + f"{class_name} in {module_path} is not a ToolParser subclass." + ) + cls.tool_parsers[name] = parser_cls # cache + return parser_cls + except Exception as e: + logger.exception( + "Failed to import lazy tool parser '%s' from %s: %s", + name, + module_path, + e, + ) + raise @classmethod def _register_module( cls, - module: type, + module: type[ToolParser], module_name: str | list[str] | None = None, force: bool = True, ) -> None: + """Register a ToolParser class immediately.""" if not issubclass(module, ToolParser): raise TypeError( f"module must be subclass of ToolParser, but got {type(module)}" ) + if module_name is None: module_name = module.__name__ + if isinstance(module_name, str): - module_name = [module_name] - for name in module_name: + module_names = [module_name] + elif is_list_of(module_name, str): + module_names = module_name + else: + raise TypeError("module_name must be str, list[str], or None.") + + for name in module_names: if not force and name in cls.tool_parsers: - existed_module = cls.tool_parsers[name] - raise KeyError( - f"{name} is already registered at {existed_module.__module__}" - ) + existed = cls.tool_parsers[name] + raise KeyError(f"{name} is already registered at {existed.__module__}") cls.tool_parsers[name] = module + @classmethod + def register_lazy_module(cls, name: str, module_path: str, class_name: str) -> None: + """ + Register a lazy module mapping. + + Example: + ToolParserManager.register_lazy_module( + name="kimi_k2", + module_path="vllm.entrypoints.openai.tool_parsers.kimi_k2_parser", + class_name="KimiK2ToolParser", + ) + """ + cls.lazy_parsers[name] = (module_path, class_name) + @classmethod def register_module( cls, name: str | list[str] | None = None, force: bool = True, - module: type | None = None, - ) -> type | Callable: + module: type[ToolParser] | None = None, + ) -> type[ToolParser] | Callable[[type[ToolParser]], type[ToolParser]]: """ - Register module with the given name or name list. it can be used as a - decoder(with module as None) or normal function(with module as not - None). + Register module immediately or lazily (as a decorator). + + Usage: + @ToolParserManager.register_module("kimi_k2") + class KimiK2ToolParser(ToolParser): + ... 
+ + Or: + ToolParserManager.register_module(module=SomeToolParser) """ if not isinstance(force, bool): raise TypeError(f"force must be a boolean, but got {type(force)}") - # raise the error ahead of time - if not (name is None or isinstance(name, str) or is_list_of(name, str)): - raise TypeError( - "name must be None, an instance of str, or a sequence of str, " - f"but got {type(name)}" - ) - - # use it as a normal method: x.register_module(module=SomeClass) + # Immediate registration if module is not None: cls._register_module(module=module, module_name=name, force=force) return module - # use it as a decorator: @x.register_module() - def _register(module): - cls._register_module(module=module, module_name=name, force=force) - return module + # Decorator usage + def _decorator(obj: type[ToolParser]) -> type[ToolParser]: + module_path = obj.__module__ + class_name = obj.__name__ - return _register + if isinstance(name, str): + names = [name] + elif is_list_of(name, str): + names = name + else: + names = [class_name] + + for n in names: + # Lazy mapping only: do not import now + cls.lazy_parsers[n] = (module_path, class_name) + + return obj + + return _decorator + + @classmethod + def list_registered(cls) -> list[str]: + """Return names of all eagerly and lazily registered tool parsers.""" + return sorted(set(cls.tool_parsers.keys()) | set(cls.lazy_parsers.keys())) @classmethod def import_tool_parser(cls, plugin_path: str) -> None: - """ - Import a user-defined tool parser by the path of the tool parser define - file. - """ - module_name = os.path.splitext(os.path.basename(plugin_path))[0] + """Import a user-defined parser file from arbitrary path.""" + module_name = os.path.splitext(os.path.basename(plugin_path))[0] try: import_from_path(module_name, plugin_path) except Exception: logger.exception( "Failed to load module '%s' from %s.", module_name, plugin_path ) - return diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py index 14fd5cf0941c6..cbeb879969ece 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py @@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("deepseek_v31") class DeepSeekV31ToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index b256560fb4beb..bf7f6fa61ab90 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("deepseek_v3") class 
DeepSeekV3ToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py index e4696334eb135..82370323cb00d 100644 --- a/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/ernie45_tool_parser.py @@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("ernie45") class Ernie45ToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): """ diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py index 5081b38240ce6..120e63b929b16 100644 --- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("glm45") class Glm4MoeModelToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index c5246685f4071..ae9217426fb51 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import ( consume_space, @@ -35,7 +34,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("granite-20b-fc") class Granite20bFCToolParser(ToolParser): """ Tool call parser for the granite-20b-functioncalling model intended diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index cc1f500342353..d29c427694dc9 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import ( consume_space, @@ -33,7 +32,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("granite") class GraniteToolParser(ToolParser): """ Tool call parser for the granite 3.0 models. 
Intended diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 6332de42f424e..4336a5438109f 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer @@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("hermes") class Hermes2ProToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py index b32e6e39b3e5c..920675c8389b8 100644 --- a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import consume_space from vllm.logger import init_logger @@ -29,7 +28,6 @@ from vllm.utils import random_uuid logger = init_logger(__name__) -@ToolParserManager.register_module("hunyuan_a13b") class HunyuanA13BToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index c87bab4353b5b..1dd327f645b3a 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger @@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module(["internlm"]) class Internlm2ToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 21ee2b762cd0a..6f53ddea4f0ef 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -18,7 +18,7 @@ from vllm.entrypoints.openai.protocol import ( FunctionCall, ToolCall, ) -from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -27,7 +27,6 @@ from vllm.transformers_utils.tokenizers import MistralTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("jamba") class JambaToolParser(ToolParser): def __init__(self, tokenizer: 
AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index 3fff3b371dbe3..0453db58361a9 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -17,7 +17,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -25,7 +24,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module(["kimi_k2"]) class KimiK2ToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index dd622b69525de..1d6de9244066e 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger @@ -31,7 +30,6 @@ class _UnexpectedAstError(Exception): pass -@ToolParserManager.register_module("llama4_pythonic") class Llama4PythonicToolParser(ToolParser): """ Toolcall parser for Llama4 that produce tool calls in a pythonic style diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 8c7b3cefb200e..02fc9b8a4d34e 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import ( find_common_prefix, @@ -33,8 +32,6 @@ from vllm.logger import init_logger logger = init_logger(__name__) -@ToolParserManager.register_module("llama3_json") -@ToolParserManager.register_module("llama4_json") class Llama3JsonToolParser(ToolParser): """ Tool call parser for Llama 3.x and 4 models intended for use with the diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 1dc1a0290c8d9..c6c8ae8ae95f1 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -3,12 +3,10 @@ import regex as re -from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ToolParserManager from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import Hermes2ProToolParser from vllm.transformers_utils.tokenizer import AnyTokenizer -@ToolParserManager.register_module("longcat") class LongcatFlashToolParser(Hermes2ProToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py index d083ece892d50..05f4826028c12 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +++ 
b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -27,7 +26,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("minimax_m2") class MinimaxM2ToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 4b12bf68b3670..982518a52e3da 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger @@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("minimax") class MinimaxToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index dbdf0085367bc..85671271522d3 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -22,7 +22,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger @@ -53,7 +52,6 @@ def _is_fn_name_regex_support(model_tokenizer: AnyTokenizer) -> bool: ) -@ToolParserManager.register_module("mistral") class MistralToolParser(ToolParser): """ Tool call parser for Mistral 7B Instruct v0.3, intended for use with diff --git a/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py index ed5633aac02d4..baff33bd7e8ac 100644 --- a/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/olmo3_tool_parser.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger @@ -31,7 +30,6 @@ class _UnexpectedAstError(Exception): pass -@ToolParserManager.register_module("olmo3") class Olmo3PythonicToolParser(ToolParser): """ Tool call parser for Olmo 3 models that produce tool calls as diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py index f44876943ac28..d1b36a297e0b1 100644 --- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py @@ -14,7 +14,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import 
init_logger @@ -26,7 +25,6 @@ else: logger = init_logger(__name__) -@ToolParserManager.register_module("openai") class OpenAIToolParser(ToolParser): def __init__(self, tokenizer: "AnyTokenizer"): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index a8387ba1494df..acb25ea2768e1 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -18,14 +18,12 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger logger = init_logger(__name__) -@ToolParserManager.register_module("phi4_mini_json") class Phi4MiniJsonToolParser(ToolParser): """ Tool call parser for phi-4-mini models intended for use with the diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 4945e7b5ab20a..abeb923b93227 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger @@ -32,7 +31,6 @@ class _UnexpectedAstError(Exception): pass -@ToolParserManager.register_module("pythonic") class PythonicToolParser(ToolParser): """ Tool call parser for models that produce tool calls in a pythonic style, diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index ad56972e6387e..26261c0065ead 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -20,7 +20,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -28,7 +27,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("qwen3_coder") class Qwen3CoderToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py index 9964d1ac25c40..cf2fa30d01547 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py @@ -21,7 +21,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -1165,7 +1164,6 @@ class StreamingXMLToolCallParser: self.deferred_param_raw_value = "" -@ToolParserManager.register_module("qwen3_xml") class Qwen3XMLToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py index 
f50a2df53bc04..8aed7f0e9fc96 100644 --- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py @@ -23,7 +23,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -31,7 +30,6 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) -@ToolParserManager.register_module("seed_oss") class SeedOssToolParser(ToolParser): TOOL_CALL_START = "" TOOL_CALL_END = "" diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py index d0255ec085391..adcb9f4765473 100644 --- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -28,7 +27,6 @@ from vllm.utils import random_uuid logger = init_logger(__name__) -@ToolParserManager.register_module(["step3"]) class Step3ToolParser(ToolParser): """ Tool parser for a model that uses a specific XML-like format for tool calls. diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index c1f0d29cc0873..9d308af4de601 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -19,7 +19,6 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, - ToolParserManager, ) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -28,7 +27,6 @@ from vllm.utils import random_uuid logger = init_logger(__name__) -@ToolParserManager.register_module("xlam") class xLAMToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) From 14a125a06df7275923fe9748f67e27e449412d1f Mon Sep 17 00:00:00 2001 From: liuzhenwei Date: Tue, 4 Nov 2025 11:28:35 +0800 Subject: [PATCH 108/976] [NIXL][XPU] Pin NIXL version to 0.7.0 (#27849) Signed-off-by: zhenwei-intel --- tools/install_nixl_from_source_ubuntu.py | 31 ++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index 742aab6b0de75..4a20b6b7bb8fb 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -3,9 +3,11 @@ # install_prerequisites.py import argparse import glob +import json import os import subprocess import sys +import urllib.request # --- Configuration --- WHEELS_CACHE_HOME = os.environ.get("WHEELS_CACHE_HOME", "/tmp/wheels_cache") @@ -18,6 +20,20 @@ NIXL_REPO_URL = "https://github.com/ai-dynamo/nixl.git" # --- Helper Functions --- +def get_latest_nixl_version(): + """Helper function to get latest release version of NIXL""" + try: + nixl_release_url = "https://api.github.com/repos/ai-dynamo/nixl/releases/latest" + with urllib.request.urlopen(nixl_release_url) as response: + data = json.load(response) + return data.get("tag_name", "0.7.0") + except 
Exception: + return "0.7.0" + + +NIXL_VERSION = os.environ.get("NIXL_VERSION", get_latest_nixl_version()) + + def run_command(command, cwd=".", env=None): """Helper function to run a shell command and check for errors.""" print(f"--> Running command: {' '.join(command)} in '{cwd}'", flush=True) @@ -37,7 +53,7 @@ def is_pip_package_installed(package_name): def find_nixl_wheel_in_cache(cache_dir): """Finds a nixl wheel file in the specified cache directory.""" # The repaired wheel will have a 'manylinux' tag, but this glob still works. - search_pattern = os.path.join(cache_dir, "nixl*.whl") + search_pattern = os.path.join(cache_dir, f"nixl*{NIXL_VERSION}*.whl") wheels = glob.glob(search_pattern) if wheels: # Sort to get the most recent/highest version if multiple exist @@ -146,6 +162,10 @@ def build_and_install_prerequisites(args): print("\n[2/3] Building NIXL wheel from source...", flush=True) if not os.path.exists(NIXL_DIR): run_command(["git", "clone", NIXL_REPO_URL, NIXL_DIR]) + else: + run_command(["git", "fetch", "--tags"], cwd=NIXL_DIR) + run_command(["git", "checkout", NIXL_VERSION], cwd=NIXL_DIR) + print(f"--> Checked out NIXL version: {NIXL_VERSION}", flush=True) build_env = os.environ.copy() build_env["PKG_CONFIG_PATH"] = os.path.join(ucx_install_path, "lib", "pkgconfig") @@ -203,7 +223,14 @@ def build_and_install_prerequisites(args): {os.path.basename(newly_built_wheel)}. Now installing...", flush=True, ) - install_command = [sys.executable, "-m", "pip", "install", newly_built_wheel] + install_command = [ + sys.executable, + "-m", + "pip", + "install", + "--no-deps", # w/o "no-deps", it will install cuda-torch + newly_built_wheel, + ] if args.force_reinstall: install_command.insert(-1, "--force-reinstall") From 380ba6816d4646be99d9b6d207ba7bc7fce8290e Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 4 Nov 2025 04:35:36 +0000 Subject: [PATCH 109/976] [Metrics] Enable sleep state metric outside of dev mode (#27867) Signed-off-by: Mark McLoughlin --- vllm/v1/metrics/loggers.py | 50 ++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 67b6ceaa847f6..e85f85bfb0aab 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,7 +9,6 @@ from typing import TypeAlias from prometheus_client import Counter, Gauge, Histogram -import vllm.envs as envs from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorLogging, @@ -395,32 +394,32 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_scheduler_waiting = make_per_engine( gauge_scheduler_waiting, engine_indexes, model_name ) - if envs.VLLM_SERVER_DEV_MODE: - gauge_engine_sleep_state = self._gauge_cls( - name="vllm:engine_sleep_state", - documentation=( - "Engine sleep state; awake = 0 means engine is sleeping; " - "awake = 1 means engine is awake; " - "weights_offloaded = 1 means sleep level 1; " - "discard_all = 1 means sleep level 2." - ), - labelnames=labelnames + ["sleep_state"], - multiprocess_mode="mostrecent", - ) - self.gauge_engine_sleep_state = {} - sleep_state = ["awake", "weights_offloaded", "discard_all"] + gauge_engine_sleep_state = self._gauge_cls( + name="vllm:engine_sleep_state", + documentation=( + "Engine sleep state; awake = 0 means engine is sleeping; " + "awake = 1 means engine is awake; " + "weights_offloaded = 1 means sleep level 1; " + "discard_all = 1 means sleep level 2." 
+ ), + labelnames=labelnames + ["sleep_state"], + multiprocess_mode="mostrecent", + ) - for s in sleep_state: - self.gauge_engine_sleep_state[s] = { - idx: gauge_engine_sleep_state.labels( - engine=idx, model_name=model_name, sleep_state=s - ) - for idx in engine_indexes - } + self.gauge_engine_sleep_state = {} + sleep_state = ["awake", "weights_offloaded", "discard_all"] - # Setting default values - self.record_sleep_state() + for s in sleep_state: + self.gauge_engine_sleep_state[s] = { + idx: gauge_engine_sleep_state.labels( + engine=idx, model_name=model_name, sleep_state=s + ) + for idx in engine_indexes + } + + # Setting default values + self.record_sleep_state() # GPU cache # @@ -1052,9 +1051,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time() def record_sleep_state(self, sleep: int = 0, level: int = 0): - if not envs.VLLM_SERVER_DEV_MODE: - return - awake = 1 discard_all = 0 weights_offloaded = 0 From 7e4be741044bfead91afc418100ff9a4d804bf7f Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 4 Nov 2025 01:05:55 -0500 Subject: [PATCH 110/976] [Bug] Batch invariant: Fix flash attn MLA `RuntimeError: scheduler_metadata must have shape (metadata_size)` (#27884) --- vllm/model_executor/layers/batch_invariant.py | 2 ++ vllm/v1/attention/backends/mla/flashattn_mla.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 39e77b935d3d5..0234f228d700a 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib +import functools import os from collections import namedtuple from collections.abc import Callable @@ -846,6 +847,7 @@ def get_batch_invariant_attention_block_size() -> AttentionBlockSize: return AttentionBlockSize(block_m=16, block_n=16) +@functools.cache def vllm_is_batch_invariant(): env_key = "VLLM_BATCH_INVARIANT" is_overridden = False diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index a6aac701b784b..6baf45efccb54 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -163,6 +163,9 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] # we only set num_splits when using cuda graphs. 
max_num_splits = self.max_num_splits + if vllm_is_batch_invariant(): + max_num_splits = 1 + scheduler_metadata = self._schedule_decode( num_reqs=seq_lens_cpu.numel(), cu_query_lens=query_start_loc_device, @@ -188,9 +191,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] self.scheduler_metadata[n:] = 0 scheduler_metadata = self.scheduler_metadata[:n] - if vllm_is_batch_invariant(): - max_num_splits = 1 - metadata = FlashAttnMLADecodeMetadata( block_table=block_table_tensor, seq_lens=seq_lens_device, From f32cbc9a0c456966df300076a3a9f2889151b024 Mon Sep 17 00:00:00 2001 From: xiangze-arm Date: Tue, 4 Nov 2025 14:33:23 +0800 Subject: [PATCH 111/976] [CPU]Improve dynamic 4bit moe performance (#27240) Signed-off-by: Zhang Xiangze --- csrc/moe/dynamic_4bit_int_moe_cpu.cpp | 33 ++++++++++----------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp index 1d06fc6b5b0a0..df47bb8dd1d7d 100644 --- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp +++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp @@ -87,30 +87,23 @@ torch::Tensor dynamic_4bit_int_moe_cpu( const int64_t g_eff_13 = (group_size != -1) ? group_size : H; const int64_t g_eff_2 = (group_size != -1) ? group_size : I; - // Per-expert outputs filled in parallel - std::vector y_list(E); - y_list.resize(E); + auto X_all = x_c.index_select(/*dim=*/0, expert_tokens); + if (apply_router_weight_on_input) { + X_all = X_all.mul(expert_gates.unsqueeze(1)); + } + auto Y_all = at::empty({offsets[E], H}, x_c.options()); at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) { + c10::InferenceMode guard; for (int64_t e = e_begin; e < e_end; ++e) { const int64_t te = counts[e]; if (te == 0) { - y_list[e] = at::empty({0, H}, x_c.options()); continue; } const int64_t start = offsets[e]; - auto sel_tokens = - expert_tokens.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); - auto gates_e = - expert_gates.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); - - auto x_e = x_c.index_select(/*dim=*/0, sel_tokens); - - if (apply_router_weight_on_input) { - x_e = x_e.mul(gates_e.unsqueeze(1)); - } + auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); auto w13_e = w13_packed.select(/*dim=*/0, e); auto w2_e = w2_packed.select(/*dim=*/0, e); @@ -137,17 +130,15 @@ torch::Tensor dynamic_4bit_int_moe_cpu( // W2 auto y = mm(act, w2_e, g_eff_2, /*in_features=*/I, /*out_features=*/H); - if (!apply_router_weight_on_input) { - y = y.mul(gates_e.unsqueeze(1)); - } - // Store per-expert result - y_list[e] = y; + Y_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te).copy_(y); } }); - // Concatenate all expert outputs to match expert_tokens order - auto Y_all = at::cat(y_list, /*dim=*/0); + if (!apply_router_weight_on_input) { + Y_all = Y_all.mul(expert_gates.unsqueeze(1)); + } + auto out = at::zeros({T, H}, x.options()); out = at::index_add(out, /*dim=*/0, /*index=*/expert_tokens, /*source=*/Y_all); From 2f84ae1f27eb628a195ee9ccd4e884baeb451d1c Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Mon, 3 Nov 2025 22:36:40 -0800 Subject: [PATCH 112/976] [CI/Build] Update LM Eval Version in AMD CI (#27944) Signed-off-by: zhewenli --- docker/Dockerfile.rocm | 1 - requirements/rocm-test.txt | 15 +++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index adb0879f20d47..06d229f315bdc 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -75,7 +75,6 @@ COPY 
--from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace RUN cd /vllm-workspace \ && rm -rf vllm \ && python3 -m pip install -e tests/vllm_test_utils \ - && python3 -m pip install lm-eval[api]==0.4.4 \ && python3 -m pip install pytest-shard # ----------------------- diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 541fa1e267cb0..432e11977872d 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -4,7 +4,7 @@ tblib==3.1.0 bm25s==0.2.13 pystemmer==3.0.0 -# entrypoints test +# Entrypoints test # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai audioread==3.0.1 cffi==1.17.1 @@ -17,11 +17,11 @@ soundfile==0.13.1 soxr==0.5.0.post1 librosa==0.10.2.post1 -# entrypoints test +# Entrypoints test #vllm[video] # required by entrypoints/openai/test_video.py decord==0.6.0 -# entrypoints test +# Entrypoints test #sentence-transformers # required by entrypoints/openai/test_score.py sentence-transformers==3.4.1 @@ -32,7 +32,10 @@ matplotlib==3.10.3 blobfile==3.0.0 # Required for openai schema test. -schemathesis==3.39.15 +schemathesis==3.39.15 -# required for mteb test -mteb[bm25s]>=1.38.11, <2 +# Required for mteb test +mteb[bm25s]>=1.38.11, <2 + +# Required for eval tests +lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d From 58279c60b52c7e6e286799a313416949f43aeefe Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 4 Nov 2025 07:00:49 +0000 Subject: [PATCH 113/976] [KV Connector] Make KVCacheConfig an explicit constructor argument (#27887) Signed-off-by: Mark McLoughlin --- .../unit/test_backwards_compatibility.py | 275 ++++++++++++++++++ tests/v1/kv_connector/unit/utils.py | 2 +- .../kv_transfer/kv_connector/factory.py | 41 ++- .../kv_transfer/kv_connector/v1/base.py | 16 +- .../kv_connector/v1/decode_bench_connector.py | 12 +- .../kv_connector/v1/lmcache_connector.py | 12 +- .../kv_connector/v1/multi_connector.py | 14 +- .../kv_connector/v1/nixl_connector.py | 12 +- .../kv_connector/v1/offloading_connector.py | 10 +- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 16 +- .../v1/shared_storage_connector.py | 16 +- .../kv_transfer/kv_transfer_state.py | 11 +- vllm/v1/core/sched/scheduler.py | 12 +- vllm/v1/worker/gpu_worker.py | 4 +- 14 files changed, 410 insertions(+), 43 deletions(-) create mode 100644 tests/v1/kv_connector/unit/test_backwards_compatibility.py diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py new file mode 100644 index 0000000000000..f51001a6ec12a --- /dev/null +++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py @@ -0,0 +1,275 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for backwards compatibility with external KV connector implementations. 
+ +This test ensures that external connectors (loaded via kv_connector_module_path) +implemented with the old signature continue to work: +- Old signature: __init__(self, vllm_config, role) +- New signature: __init__(self, vllm_config, role, kv_cache_config) +""" + +from typing import TYPE_CHECKING +from unittest.mock import patch + +import pytest + +from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory +from vllm.distributed.kv_transfer.kv_connector.v1 import ( + KVConnectorBase_V1, + KVConnectorRole, +) +from vllm.v1.core.sched.output import SchedulerOutput + +from .utils import create_scheduler, create_vllm_config + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.config import VllmConfig + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig + from vllm.v1.request import Request + + +class OldStyleTestConnector(KVConnectorBase_V1): + """ + Test connector using the old signature with 2 required arguments. + This simulates external connectors that haven't been updated yet. + """ + + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + # Old-style call to super().__init__ with only 2 arguments + super().__init__(vllm_config=vllm_config, role=role) + + def get_num_new_matched_tokens( + self, request: "Request", num_computed_tokens: int + ) -> tuple[int | None, bool]: + return 0, False + + def update_state_after_alloc( + self, + request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int, + ): + pass + + def build_connector_meta(self, scheduler_output: SchedulerOutput): + return None + + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: + pass + + def wait_for_layer_load(self, layer_name: str) -> None: + pass + + def save_kv_layer( + self, + layer_name: str, + kv_layer, + attn_metadata: "AttentionMetadata", + **kwargs, + ) -> None: + pass + + def wait_for_save(self): + pass + + +class NewStyleTestConnector(KVConnectorBase_V1): + """ + Test connector using the new signature with 3 required arguments. + """ + + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: "KVCacheConfig", + ): + # New-style call to super().__init__ with all 3 arguments + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) + + def get_num_new_matched_tokens( + self, request: "Request", num_computed_tokens: int + ) -> tuple[int | None, bool]: + return 0, False + + def update_state_after_alloc( + self, + request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int, + ): + pass + + def build_connector_meta(self, scheduler_output: SchedulerOutput): + return None + + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: + pass + + def wait_for_layer_load(self, layer_name: str) -> None: + pass + + def save_kv_layer( + self, + layer_name: str, + kv_layer, + attn_metadata: "AttentionMetadata", + **kwargs, + ) -> None: + pass + + def wait_for_save(self): + pass + + +@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER]) +def test_external_old_signature_factory_instantiation(role): + """ + Test that external connectors with old signature (2 required args) loaded + via kv_connector_module_path are correctly instantiated with backwards + compatibility support. 
+ """ + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_connector = "OldStyleTestConnector" + vllm_config.kv_transfer_config.kv_connector_module_path = ( + "tests.v1.kv_connector.unit.test_backwards_compatibility" + ) + + scheduler = create_scheduler(vllm_config) + kv_cache_config = scheduler.kv_cache_config + + connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config) + + assert connector is not None + assert isinstance(connector, OldStyleTestConnector) + assert connector.role == role + assert connector._kv_cache_config is None + + +@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER]) +def test_external_new_signature_factory_instantiation(role): + """ + Test that external connectors with new signature (3 required args) loaded + via kv_connector_module_path are correctly instantiated. + """ + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_connector = "NewStyleTestConnector" + vllm_config.kv_transfer_config.kv_connector_module_path = ( + "tests.v1.kv_connector.unit.test_backwards_compatibility" + ) + + scheduler = create_scheduler(vllm_config) + kv_cache_config = scheduler.kv_cache_config + + connector = KVConnectorFactory.create_connector(vllm_config, role, kv_cache_config) + + assert connector is not None + assert isinstance(connector, NewStyleTestConnector) + assert connector.role == role + assert connector._kv_cache_config is not None + assert connector._kv_cache_config == kv_cache_config + + +@pytest.mark.parametrize("role", [KVConnectorRole.SCHEDULER, KVConnectorRole.WORKER]) +def test_old_signature_super_init(role): + """ + Test that old-style connectors can call super().__init__() without + kv_cache_config parameter. + """ + vllm_config = create_vllm_config() + + connector = OldStyleTestConnector(vllm_config, role) + + assert connector is not None + assert connector.role == role + assert connector._kv_cache_config is None + + +def test_old_signature_super_init_with_kwargs(): + """ + Test that old-style connectors can call super().__init__() with keyword + arguments in different orders. + """ + vllm_config = create_vllm_config() + + # Test with vllm_config= and role= kwargs + connector1 = OldStyleTestConnector( + vllm_config=vllm_config, role=KVConnectorRole.SCHEDULER + ) + assert connector1 is not None + assert connector1._kv_cache_config is None + + # Test with role= and vllm_config= in reversed order + connector2 = OldStyleTestConnector( + role=KVConnectorRole.WORKER, vllm_config=vllm_config + ) + assert connector2 is not None + assert connector2._kv_cache_config is None + + +def test_internal_connector_uses_new_signature(): + """ + Test that internal connectors (registered in factory) always use the new + signature and get kv_cache_config. 
+ """ + from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( + SharedStorageConnector, + ) + + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_connector = "SharedStorageConnector" + + scheduler = create_scheduler(vllm_config) + kv_cache_config = scheduler.kv_cache_config + + connector = KVConnectorFactory.create_connector( + vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config + ) + + assert connector is not None + assert isinstance(connector, SharedStorageConnector) + assert connector._kv_cache_config is not None + assert connector._kv_cache_config == kv_cache_config + + +def test_signature_detection_with_mocking(): + """ + Test that the factory correctly applies compat_sig flag returned from + _get_connector_class_with_compat. + """ + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + kv_cache_config = scheduler.kv_cache_config + + # Mock _get_connector_class_with_compat to return old-style connector + with patch.object( + KVConnectorFactory, + "_get_connector_class_with_compat", + return_value=(OldStyleTestConnector, True), + ): + old_connector = KVConnectorFactory.create_connector( + vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config + ) + assert old_connector is not None + assert isinstance(old_connector, OldStyleTestConnector) + assert old_connector._kv_cache_config is None + + # Mock _get_connector_class_with_compat to return new-style connector + with patch.object( + KVConnectorFactory, + "_get_connector_class_with_compat", + return_value=(NewStyleTestConnector, False), + ): + new_connector = KVConnectorFactory.create_connector( + vllm_config, KVConnectorRole.SCHEDULER, kv_cache_config + ) + assert new_connector is not None + assert isinstance(new_connector, NewStyleTestConnector) + assert new_connector._kv_cache_config is not None + assert new_connector._kv_cache_config == kv_cache_config diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 46ea46e53084e..c1c0e13f77539 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -254,7 +254,7 @@ def create_model_runner_output( class TestSharedStorageConnector(SharedStorageConnector): - def __init__(self, config: VllmConfig, role): + def __init__(self, config: VllmConfig, role, kv_cache_config): self.name = config.kv_transfer_config.kv_connector_extra_config["name"] self._connector = SharedStorageConnector(config, role) self.call_record: dict[str, int] = defaultdict(int) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index c64996f13cd5d..8d14200c52407 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -3,10 +3,9 @@ import importlib from collections.abc import Callable -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Optional, cast import vllm.envs as envs -from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.base import ( KVConnectorBase, KVConnectorBaseType, @@ -16,9 +15,12 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import ( supports_hma, ) from vllm.logger import init_logger +from vllm.utils.func_utils import supports_kw if TYPE_CHECKING: + from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig + from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) @@ -41,8 +43,9 @@ class KVConnectorFactory: 
@classmethod def create_connector( cls, - config: VllmConfig, + config: "VllmConfig", role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, ) -> KVConnectorBase: if not envs.VLLM_USE_V1: raise ValueError( @@ -53,7 +56,9 @@ class KVConnectorFactory: kv_transfer_config = config.kv_transfer_config if kv_transfer_config is None: raise ValueError("kv_transfer_config must be set to create a connector") - connector_cls = cls.get_connector_class(kv_transfer_config) + connector_cls, compat_sig = cls._get_connector_class_with_compat( + kv_transfer_config + ) # check if the connector supports HMA hma_enabled = not config.scheduler_config.disable_hybrid_kv_cache_manager @@ -76,7 +81,12 @@ class KVConnectorFactory: # - Co-locate with worker process # - Should only be used inside the forward context & attention layer # We build separately to enforce strict separation - return connector_cls(config, role) + if compat_sig: + # Old signature: __init__(self, vllm_config, role) + return connector_cls(config, role) + else: + # New signature: __init__(self, vllm_config, role, kv_cache_config) + return connector_cls(config, role, kv_cache_config) @classmethod def get_connector_class_by_name( @@ -97,13 +107,13 @@ class KVConnectorFactory: return cls._registry[connector_name]() @classmethod - def get_connector_class( + def _get_connector_class_with_compat( cls, kv_transfer_config: "KVTransferConfig" - ) -> type[KVConnectorBaseType]: - """Get the connector class by name.""" + ) -> tuple[type[KVConnectorBaseType], bool]: connector_name = kv_transfer_config.kv_connector if connector_name is None: raise ValueError("Connector name is not set in KVTransferConfig") + compat_sig = False if connector_name in cls._registry: connector_cls = cls._registry[connector_name]() else: @@ -118,6 +128,21 @@ class KVConnectorFactory: f"Class {connector_name} not found in {connector_module_path}" ) from e connector_cls = cast(type[KVConnectorBaseType], connector_cls) + if not supports_kw(connector_cls, "kv_cache_config"): + compat_sig = True + logger.warning( + "Connector %s uses deprecated signature with 2 required arguments. " + "Please update to include kv_cache_config as the second argument.", + connector_cls.__name__, + ) + return connector_cls, compat_sig + + @classmethod + def get_connector_class( + cls, kv_transfer_config: "KVTransferConfig" + ) -> type[KVConnectorBaseType]: + """Get the connector class by name.""" + connector_cls, _ = cls._get_connector_class_with_compat(kv_transfer_config) return connector_cls diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index cb9f208a839f2..354aa9a87183d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -58,6 +58,7 @@ if TYPE_CHECKING: ) from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request # s_tensor_list, d_tensor_list, s_indices, d_indices, direction @@ -141,7 +142,12 @@ class KVConnectorMetadata(ABC): # noqa: B024 class KVConnectorBase_V1(ABC): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): logger.warning( "Initializing KVConnectorBase_V1. 
This API is experimental and " "subject to change in the future as we iterate the design." @@ -152,6 +158,14 @@ class KVConnectorBase_V1(ABC): self._kv_transfer_config = vllm_config.kv_transfer_config else: raise ValueError("kv_transfer_config must be set for KVConnectorBase_V1") + self._kv_cache_config = kv_cache_config + if self._kv_cache_config is None: + logger.warning( + "KVConnectorBase_V1 initialized without kv_cache_config. " + "This is deprecated - please update your connector to accept " + "kv_cache_config as the third constructor argument and pass it " + "to super().__init__()." + ) self._role = role @property diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py index ca251cd0c6ebd..9cd7d93c92fa3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py @@ -32,7 +32,7 @@ Usage: """ from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional import torch @@ -50,6 +50,7 @@ if TYPE_CHECKING: from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request logger = init_logger(__name__) @@ -79,8 +80,13 @@ class DecodeBenchConnector(KVConnectorBase_V1): testing of the decoder with larger input sequence lengths. """ - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config, role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__(vllm_config, role, kv_cache_config) self.connector_scheduler: DecodeBenchConnectorScheduler | None = None self.connector_worker: DecodeBenchConnectorWorker | None = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 7232d947030cb..575ab468be566 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -20,14 +20,22 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request logger = init_logger(__name__) class LMCacheConnectorV1(KVConnectorBase_V1): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: "KVCacheConfig", + ): + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) assert vllm_config.kv_transfer_config is not None use_native = vllm_config.kv_transfer_config.get_from_extra_config( "use_native", False diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index d56f30bd11e5b..d7bbf02c83677 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -31,6 +31,7 @@ 
if TYPE_CHECKING: from vllm.distributed.kv_events import KVCacheEvent from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request logger = init_logger(__name__) @@ -109,15 +110,22 @@ class MultiConnector(KVConnectorBase_V1): - Save to all connectors. """ - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: "KVCacheConfig", + ): + super().__init__( + vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config + ) self._connectors: list[KVConnectorBase_V1] = [] self._ktc_kv_transfer_config = [] for connector_cls, temp_config in self._get_connector_classes_and_configs( vllm_config ): - self._connectors.append(connector_cls(temp_config, role)) + self._connectors.append(connector_cls(temp_config, role, kv_cache_config)) self._ktc_kv_transfer_config.append(temp_config.kv_transfer_config) # A mapping from request id to the index of the connector chosen to diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 4651cedbc7dfa..ff9770b72bd38 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -13,7 +13,7 @@ from collections import defaultdict from collections.abc import Iterator from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional import msgspec import numpy as np @@ -52,6 +52,7 @@ from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request Transfer = tuple[int, float] # (xfer_handle, start_time) @@ -150,7 +151,14 @@ class NixlConnectorMetadata(KVConnectorMetadata): class NixlConnector(KVConnectorBase_V1): - def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): + def __init__( + self, + vllm_config: VllmConfig, + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__(vllm_config, role, kv_cache_config) + assert vllm_config.kv_transfer_config is not None assert vllm_config.kv_transfer_config.engine_id is not None self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 7567c7fae5789..582e42cc466ae 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -21,6 +21,7 @@ from vllm.logger import init_logger from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import BlockHash from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_offload.abstract import OffloadingManager from vllm.v1.kv_offload.factory import OffloadingSpecFactory from vllm.v1.kv_offload.mediums import GPULoadStoreSpec @@ -41,8 +42,13 @@ class 
OffloadingConnectorMetadata(KVConnectorMetadata): class OffloadingConnector(KVConnectorBase_V1): - def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): - super().__init__(vllm_config, role) + def __init__( + self, + vllm_config: VllmConfig, + role: KVConnectorRole, + kv_cache_config: KVCacheConfig | None = None, + ): + super().__init__(vllm_config, role, kv_cache_config) spec = OffloadingSpecFactory.create_spec(vllm_config) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 780dd12fccda3..a124a0d519db8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional import regex as re import torch @@ -25,6 +25,7 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request logger = init_logger(__name__) @@ -71,8 +72,17 @@ class P2pNcclConnectorMetadata(KVConnectorMetadata): class P2pNcclConnector(KVConnectorBase_V1): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__( + vllm_config=vllm_config, + role=role, + kv_cache_config=kv_cache_config, + ) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Any] = {} self.is_producer = self._kv_transfer_config.is_kv_producer diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 9c230d7d0d2f4..016d1d45b3593 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -3,7 +3,7 @@ import hashlib import os from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional import safetensors import torch @@ -22,6 +22,7 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request logger = init_logger(__name__) @@ -86,8 +87,17 @@ class SharedStorageConnector(KVConnectorBase_V1): # It does extra work which will overwrite the existing prefix-cache in GPU # - to remove the overhead, need to add some "mask" in the ReqMeta class - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): - super().__init__(vllm_config=vllm_config, role=role) + def __init__( + self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + kv_cache_config: Optional["KVCacheConfig"] = None, + ): + super().__init__( + vllm_config=vllm_config, + role=role, + kv_cache_config=kv_cache_config, + ) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: 
dict[str, Request] = {} self._storage_path = self._kv_transfer_config.get_from_extra_config( diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index cabfc10e7f942..7501f0b373d46 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional from vllm import envs from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType @@ -12,6 +12,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import ( if TYPE_CHECKING: from vllm.config import VllmConfig + from vllm.v1.kv_cache_interface import KVCacheConfig _KV_CONNECTOR_AGENT: KVConnectorBaseType | None = None @@ -48,7 +49,9 @@ def is_v1_kv_transfer_group(connector: KVConnectorBaseType | None = None) -> boo return isinstance(connector, KVConnectorBase_V1) -def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: +def ensure_kv_transfer_initialized( + vllm_config: "VllmConfig", kv_cache_config: Optional["KVCacheConfig"] = None +) -> None: """ Initialize KV cache transfer parallel group. """ @@ -64,7 +67,9 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: ): if envs.VLLM_USE_V1: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector( - config=vllm_config, role=KVConnectorRole.WORKER + config=vllm_config, + role=KVConnectorRole.WORKER, + kv_cache_config=kv_cache_config, ) else: raise ValueError("V0 is no longer supported") diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f51744eb2640b..aeb9869c52813 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import copy import itertools import time from collections import defaultdict @@ -92,15 +91,10 @@ class Scheduler(SchedulerInterface): assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" ) - - connector_vllm_config = copy.copy(self.vllm_config) - - # We're dynamically inserting a kv_cache_config variable into the - # connector_vllm_config. This is distinct from the cache_config - # that is already in there. - connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) # type: ignore[attr-defined] self.connector = KVConnectorFactory.create_connector( - config=connector_vllm_config, role=KVConnectorRole.SCHEDULER + config=self.vllm_config, + role=KVConnectorRole.SCHEDULER, + kv_cache_config=self.kv_cache_config, ) if self.log_stats: self.connector_prefix_cache_stats = PrefixCacheStats() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index c2bf1419bebd7..f3fe202cec062 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -380,9 +380,7 @@ class Worker(WorkerBase): # NOTE(Kuntai): This need to be done before `initialize_kv_cache`, # because `initialize_kv_cache` will inject kv cache groups not # related to kv cache connector (e.g. kv cache sharing layers). 
- connector_vllm_config = copy.copy(self.vllm_config) - connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) - ensure_kv_transfer_initialized(connector_vllm_config) + ensure_kv_transfer_initialized(self.vllm_config, kv_cache_config) if self.vllm_config.model_config.enable_sleep_mode: from vllm.device_allocator.cumem import CuMemAllocator From 43a6acfb7de8c7ad839d41bc2109fafe692b77ba Mon Sep 17 00:00:00 2001 From: CSWYF3634076 Date: Tue, 4 Nov 2025 15:16:46 +0800 Subject: [PATCH 114/976] [Model] fix ernie45 reasoning_parser (#27973) Signed-off-by: wangyafeng --- vllm/reasoning/ernie45_reasoning_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/reasoning/ernie45_reasoning_parser.py b/vllm/reasoning/ernie45_reasoning_parser.py index f9d4a30398cfd..8dfbcc0ce46bf 100644 --- a/vllm/reasoning/ernie45_reasoning_parser.py +++ b/vllm/reasoning/ernie45_reasoning_parser.py @@ -36,8 +36,8 @@ class Ernie45ReasoningParser(BaseThinkingReasoningParser): """The token that ends reasoning content.""" return "" - def __init__(self, tokenizer: PreTrainedTokenizerBase): - super().__init__(tokenizer) + def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): + super().__init__(tokenizer, *args, **kwargs) if not self.model_tokenizer: raise ValueError( From 53f6e81dfd9cdba797ddade119a5e33389a35957 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Mon, 3 Nov 2025 23:20:50 -0800 Subject: [PATCH 115/976] [CI/Build] Fix OpenAI API correctness on AMD CI (#28022) Signed-off-by: zhewenli --- .buildkite/test-amd.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index c023457fb03e4..5abf6122a5c39 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -629,15 +629,16 @@ steps: - label: OpenAI API correctness # 22min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - commands: # LMEval+Transcription WER check - - pytest -s entrypoints/openai/correctness/ + commands: # LMEval + # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 + - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 From 4022a9d279d09efe1b8a36ff3531bf1d4c8f08ca Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 4 Nov 2025 02:56:21 -0500 Subject: [PATCH 116/976] [BugFix][Performance] Restore flashinfer autotuning for all scenarios (#27904) --- tests/quantization/test_blackwell_moe.py | 16 ++--------- .../layers/fused_moe/trtllm_moe.py | 11 ++++++-- .../layers/quantization/mxfp4.py | 4 +-- vllm/model_executor/warmup/kernel_warmup.py | 27 +------------------ 4 files changed, 14 insertions(+), 44 deletions(-) diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py index 3cae6f46147bf..8dd4551ff4b96 100644 --- a/tests/quantization/test_blackwell_moe.py +++ b/tests/quantization/test_blackwell_moe.py @@ -172,21 +172,9 @@ def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT) -def 
test_gptoss_dp2_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1") - monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput") +def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch): can_initialize( "openai/gpt-oss-20b", - extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"], - hf_overrides=HF_OVERRIDE_TEXT, - ) - - -def test_gptoss_dp2_mxfp4bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1") - monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput") - can_initialize( - "openai/gpt-oss-20b", - extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"], hf_overrides=HF_OVERRIDE_TEXT, + extra_args=["--enforce-eager"], ) diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index e305483eb17db..132d35e65aba8 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -127,10 +127,17 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): "routing_method_type": 1, "do_finalize": True, "output": output, - "tune_max_num_tokens": self.max_capture_size, + "tune_max_num_tokens": max(self.max_capture_size, 1), } from flashinfer import trtllm_fp4_block_scale_routed_moe - trtllm_fp4_block_scale_routed_moe(**kwargs) + from vllm.utils.flashinfer import autotune + + with autotune(False): + # Enable autotune when, + # https://github.com/flashinfer-ai/flashinfer/issues/2023 is + # resolved. + trtllm_fp4_block_scale_routed_moe(**kwargs) + return output diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 597ee1b6bafe1..bf34ec0f38996 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1047,7 +1047,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): None, 1 if renormalize else 0, # routing_method_type, renormalize True, # do finalize - tune_max_num_tokens=self.max_capture_size, + tune_max_num_tokens=max(self.max_capture_size, 1), )[0] return trtllm_gen_output elif ( @@ -1122,7 +1122,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): tp_rank=self.moe.tp_rank, ep_size=self.moe.ep_size, ep_rank=self.moe.ep_rank, - tune_max_num_tokens=self.max_capture_size, + tune_max_num_tokens=max(self.max_capture_size, 1), **extra_kwargs, ) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index ffa3bc8f021ef..28792338f036f 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -11,7 +11,6 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs -from vllm.config import CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup from vllm.platforms import current_platform @@ -25,26 +24,6 @@ if TYPE_CHECKING: logger = init_logger(__name__) -def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: - """ - Record known issues with vllm + flashinfer autotune here. Return True if - and only if flashinfer autotune will run through without issues. 
- """ - is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or ( - vllm_config.parallel_config.tensor_parallel_size > 1 - ) - is_fi_mxfp4_backend = ( - envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS - ) or ( - current_platform.is_cuda() and current_platform.is_device_capability(100) - ) # on >=sm100, default mxfp4 backend is flashinfer - is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE - - return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager) - - def kernel_warmup(worker: "Worker"): # Deep GEMM warmup do_deep_gemm_warmup = ( @@ -58,11 +37,7 @@ def kernel_warmup(worker: "Worker"): deep_gemm_warmup(model, max_tokens) # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs - if ( - has_flashinfer() - and current_platform.has_device_capability(90) - and flashinfer_autotune_supported(worker.vllm_config) - ): + if has_flashinfer() and current_platform.has_device_capability(90): flashinfer_autotune(worker.model_runner) # FlashInfer attention warmup From 2ec401bc39daf0c8daa7f7c6bffe4f5e15cb7c79 Mon Sep 17 00:00:00 2001 From: yugong333 Date: Tue, 4 Nov 2025 02:27:35 -0800 Subject: [PATCH 117/976] Load tuned fused_moe_lora shrink and expand kernel configs separately (#27435) Signed-off-by: Yu Gong Co-authored-by: Jee Jee Li --- benchmarks/kernels/benchmark_lora.py | 478 ++++++++++++++++-- tests/lora/test_fused_moe_lora_kernel.py | 11 + vllm/lora/layers/fused_moe.py | 103 +++- vllm/lora/ops/triton_ops/README_TUNING.md | 11 +- vllm/lora/ops/triton_ops/__init__.py | 9 +- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 356 ++++++++++--- vllm/lora/ops/triton_ops/utils.py | 43 +- vllm/lora/punica_wrapper/punica_base.py | 3 +- vllm/lora/punica_wrapper/punica_gpu.py | 22 +- 9 files changed, 911 insertions(+), 125 deletions(-) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index bf1512268fe0b..6715c9b548aa1 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -19,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement from utils import ArgPool, Bench, CudaGraphBenchParams from weight_shapes import WEIGHT_SHAPES -from vllm.triton_utils import HAS_TRITON +from vllm.lora.ops.triton_ops.utils import get_lora_op_configs +from vllm.triton_utils import HAS_TRITON, triton if HAS_TRITON: - from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink + from vllm.lora.ops.triton_ops import ( ## added fused_moe_lora + LoRAKernelMeta, + fused_moe_lora_expand, + fused_moe_lora_shrink, + lora_expand, + lora_shrink, + ) + from vllm.lora.ops.triton_ops.fused_moe_lora_op import ( + _LORA_PTR_DICT, ## added _LORA_PTR_DICT for fused_moe_lora + ) from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT - +from vllm import _custom_ops as ops from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.math_utils import round_up DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_TP_SIZES = [1] @@ -59,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4] DEFAULT_SORT_BY_LORA_IDS = [False, True] DEFAULT_SEQ_LENGTHS = [1] DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] +DEFAULT_TOP_K_NUMS = [1] # Added for MoE LoRA top_k +DEFAULT_NUM_EXPERTS = [8] # Added for MoE LoRA num_experts # Utilities @@ -191,6 +204,11 @@ class OpType(Enum): LORA_SHRINK = auto() LORA_EXPAND = auto() + ## Adding support for fused moe lora + 
FUSED_MOE_LORA_GATE_UP_SHRINK = auto() ## Gate/Up projection variant with shrink + FUSED_MOE_LORA_GATE_UP_EXPAND = auto() ## Gate/Up projection variant with expand + FUSED_MOE_LORA_DOWN_SHRINK = auto() ## Down projection variant with shrink + FUSED_MOE_LORA_DOWN_EXPAND = auto() ## Down projection variant with expand @staticmethod def from_str(s: str) -> "OpType": @@ -198,6 +216,15 @@ class OpType(Enum): return OpType.LORA_SHRINK if s.lower() == "lora_expand": return OpType.LORA_EXPAND + # Adding support for fused moe lora, both in gate_up and down + if s.lower() == "fused_moe_lora_gate_up_shrink": ## Gate/Up variant with shrink + return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK + if s.lower() == "fused_moe_lora_gate_up_expand": ## Gate/Up variant with expand + return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND + if s.lower() == "fused_moe_lora_down_shrink": ## Down variant with shrink + return OpType.FUSED_MOE_LORA_DOWN_SHRINK + if s.lower() == "fused_moe_lora_down_expand": ## Down variant with expand + return OpType.FUSED_MOE_LORA_DOWN_EXPAND raise ValueError(f"Unrecognized str {s} to convert to OpType") def is_shrink_fn(self) -> bool: @@ -206,19 +233,56 @@ class OpType(Enum): def is_expand_fn(self) -> bool: return self in [OpType.LORA_EXPAND] + def is_fused_moe_lora_fn(self) -> bool: ## adding for fused MoE LoRA + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ] + + def is_fused_moe_lora_gate_up_fn( + self, + ) -> bool: ## adding for fused MoE LoRA Gate/Up + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + ] + + def is_fused_moe_lora_down_fn(self) -> bool: ## adding for fused MoE LoRA Down + return self in [ + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ] + + def is_fused_moe_lora_shrink_fn(self) -> bool: + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + ] + + def is_fused_moe_lora_expand_fn(self) -> bool: + return self in [ + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ] + def num_slices(self) -> list[int]: + if self.is_fused_moe_lora_gate_up_fn(): + return [2] + elif self.is_fused_moe_lora_down_fn(): + return [1] return [1, 2, 3] def mkn( self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int ) -> tuple[int, int, int]: num_tokens = batch_size * seq_length - if self.is_shrink_fn(): + if self.is_shrink_fn() or self.is_fused_moe_lora_fn(): m = num_tokens k = hidden_size n = lora_rank - else: - assert self.is_expand_fn() + elif self.is_expand_fn(): m = num_tokens k = lora_rank n = hidden_size @@ -232,9 +296,36 @@ class OpType(Enum): """ if self.is_shrink_fn(): return op_dtype, op_dtype, torch.float32 - else: - assert self.is_expand_fn() + elif self.is_expand_fn(): return torch.float32, op_dtype, op_dtype + else: + assert self.is_fused_moe_lora_fn() + return op_dtype, op_dtype, op_dtype + + def matmul_shapes_fused_moe_lora( + self, + m: int, + n: int, + k: int, + num_loras: int, + num_slices: int, + top_k_num: int, + num_experts: int, + ) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]: + if self.is_fused_moe_lora_shrink_fn(): + input_shape = ( + (m * top_k_num, n) + if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] + else (m, n) + ) + output_shape = (num_slices, m, top_k_num, k) + weight_shape = (num_loras, num_experts, k, n) + else: + assert self.is_fused_moe_lora_expand_fn() + input_shape = 
(num_slices, m, top_k_num, k) + output_shape = (m, top_k_num, n * num_slices) + weight_shape = (num_loras, num_experts, n, k) + return (input_shape, weight_shape, output_shape) def matmul_shapes( self, @@ -244,6 +335,8 @@ class OpType(Enum): lora_rank: int, num_loras: int, num_slices: int, + top_k_num: int | None = None, + num_experts: int | None = None, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: """ Given num_slices, return the shapes of the A, B, and C matrices @@ -258,6 +351,16 @@ class OpType(Enum): if self in [OpType.LORA_EXPAND]: # LoRA expand kernels support num_slices inherently in the kernel return ((num_slices, m, k), b_shape, (m, n * num_slices)) + if self.is_fused_moe_lora_fn(): + return self.matmul_shapes_fused_moe_lora( + m, + k, + n, + num_loras, + num_slices, + top_k_num, + num_experts, + ) raise ValueError(f"Unrecognized op_type {self}") def bench_fn(self) -> Callable: @@ -265,6 +368,16 @@ class OpType(Enum): return lora_shrink if self == OpType.LORA_EXPAND: return lora_expand + if self in [ + OpType.FUSED_MOE_LORA_GATE_UP_SHRINK, + OpType.FUSED_MOE_LORA_DOWN_SHRINK, + ]: + return fused_moe_lora_shrink + if self in [ + OpType.FUSED_MOE_LORA_GATE_UP_EXPAND, + OpType.FUSED_MOE_LORA_DOWN_EXPAND, + ]: + return fused_moe_lora_expand raise ValueError(f"Unrecognized optype {self}") @@ -318,6 +431,8 @@ class BenchmarkContext: sort_by_lora_id: bool dtype: torch.dtype seq_length: int | None = None + num_experts: int | None = None # num_experts for MoE based ops + top_k_num: int | None = None # top_k for MoE based ops num_slices: int | None = None # num_slices for slice based ops def with_seq_length(self, seq_length: int) -> "BenchmarkContext": @@ -373,6 +488,11 @@ class BenchmarkTensors: f"{dtype_to_str(self.output.dtype)}" ) + def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType): + return ( + size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size + ) + @staticmethod def make( ctx: BenchmarkContext, op_type: OpType, device: str = "cuda" @@ -385,6 +505,8 @@ class BenchmarkTensors: ctx.lora_rank, ctx.num_loras, ctx.num_slices, + ctx.top_k_num, + ctx.num_experts, ) a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) input_tensor, lora_weights, output_tensor = make_rand_tensors( @@ -432,17 +554,27 @@ class BenchmarkTensors: prompt_lora_indices_tensor, ) - def sanity_check(self) -> None: + def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None: """ Fails asserts when non-conformality is detected. 
""" - num_tokens = self.input.shape[-2] + num_tokens = ( + self.input.shape[1] + if op_type.is_fused_moe_lora_expand_fn() + else self.input.shape[-2] + ) # check metadata tensors - assert torch.sum(self.seq_lens) == num_tokens + ## In down shrink case, each token is repeated top_k_num times + assert num_tokens == self.get_num_tokens( + torch.sum(self.seq_lens), ctx.top_k_num, op_type + ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}" num_seqs = self.seq_lens.shape[0] # assert self.seq_start_loc.shape[0] == num_seqs + ## In down shrink case, each prompt corresponds to top_k_num sequences assert self.prompt_lora_mapping.shape[0] == num_seqs - assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens + assert self.get_num_tokens( + self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type + ) def to_device(self, device: str): """ @@ -471,21 +603,111 @@ class BenchmarkTensors: to_device(field) if field_name != "no_lora_flag_cpu" else field, ) - def metadata(self) -> tuple[int, int, int]: + def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]: """ Return num_seqs, num_tokens and max_seq_len """ num_seqs = self.seq_lens.shape[0] - num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0] + num_tokens = self.get_num_tokens( + self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type + ) max_seq_len = torch.max(self.seq_lens).item() num_slices = len(self.lora_weights_lst) return num_seqs, num_tokens, max_seq_len, num_slices - def as_lora_shrink_kwargs(self) -> dict[str, Any]: - self.sanity_check() + def fused_moe_lora_data_prepare( + self, + block_size: int, + token_lora_mapping: torch.Tensor, + ctx: BenchmarkContext, + ): + def moe_lora_align_block_size( + topk_ids: torch.Tensor, + token_lora_mapping: torch.Tensor, + block_size: int, + num_experts: int, + max_loras: int, + expert_map: torch.Tensor | None = None, + pad_sorted_ids: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Aligns tokens and experts into block-sized chunks for LoRA-based + mixture-of-experts (MoE) execution. 
+ """ + max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + if pad_sorted_ids: + max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) + sorted_ids = torch.empty( + (max_loras * max_num_tokens_padded,), + dtype=torch.int32, + device=topk_ids.device, + ) + max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size) + # Expert ids must be set default to -1 to prevent a blank block + expert_ids = torch.empty( + (max_loras * max_num_m_blocks,), + dtype=torch.int32, + device=topk_ids.device, + ) + num_tokens_post_pad = torch.empty( + (max_loras), dtype=torch.int32, device=topk_ids.device + ) + + ops.moe_lora_align_block_size( + topk_ids, + token_lora_mapping, + num_experts, + block_size, + max_loras, + max_num_tokens_padded, + max_num_m_blocks, + sorted_ids, + expert_ids, + num_tokens_post_pad, + ) + if expert_map is not None: + expert_ids = expert_map[expert_ids] + + return sorted_ids, expert_ids, num_tokens_post_pad + + num_tokens = ctx.batch_size + curr_topk_ids = torch.randint( + 0, + ctx.num_experts, + (num_tokens, ctx.top_k_num), + device="cuda", + dtype=torch.int32, + ) + topk_weights = torch.randint( + 0, + ctx.num_experts, + (num_tokens, ctx.top_k_num), + device="cuda", + dtype=torch.int32, + ) + + (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = ( + moe_lora_align_block_size( + topk_ids=curr_topk_ids, + token_lora_mapping=token_lora_mapping, + block_size=block_size, + num_experts=ctx.num_experts, + max_loras=ctx.num_loras, + ) + ) + + sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1) + expert_ids = expert_ids_lora.view(ctx.num_loras, -1) + num_tokens_post_padded = num_tokens_post_padded_lora + return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) + + def as_lora_shrink_kwargs( + self, ctx: BenchmarkContext, op_type: OpType + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) self.to_device(self.input.device) - _, num_tokens, _, num_slices = self.metadata() + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) # Sanity check matrix shapes. i_shape, lw_shape, o_shape = ( @@ -520,11 +742,13 @@ class BenchmarkTensors: "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } - def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]: - self.sanity_check() + def as_lora_expand_kwargs( + self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) self.to_device(self.input.device) - _, num_tokens, _, num_slices = self.metadata() + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) # Sanity check matrix shapes. i_shape, lw_shape, o_shape = ( @@ -561,18 +785,173 @@ class BenchmarkTensors: "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu, } - def bench_fn_kwargs( - self, op_type: OpType, add_inputs: bool | None = None + def as_fused_moe_lora_shrink_kwargs( + self, ctx: BenchmarkContext, op_type: OpType ) -> dict[str, Any]: - if op_type.is_shrink_fn(): + self.sanity_check(ctx, op_type) + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + # Expected input shape : [num_tokens, hidden_size] for gate_up + # Expected input shape : [top_k_num * num_tokens, hidden_size] for down + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size] + assert len(lw_shape) == 4 + assert lw_shape[-1] == hidden_size + lora_rank = lw_shape[-2] + # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank] + assert len(o_shape) == 4 + assert ( + o_shape + == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank) + if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] + else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank) + ) + kernel_config = get_lora_op_configs( + op_type.name.lower(), + max_loras=lw_shape[0], + batch=num_tokens, + hidden_size=hidden_size, + rank=lora_rank, + num_slices=num_slices, + add_inputs=False, + ) + + (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = ( + self.fused_moe_lora_data_prepare( + block_size=kernel_config["BLOCK_SIZE_M"], + token_lora_mapping=self.lora_kernel_meta.token_lora_mapping, + ctx=ctx, + ) + ) + + return { + "qcurr_hidden_states": self.input, + "lora_a_stacked": self.lora_weights_lst, + "a_intermediate_cache1": self.output, + "topk_weights": topk_weights, + "sorted_token_ids": sorted_token_ids, + "expert_ids": expert_ids, + "num_tokens_post_padded": num_tokens_post_padded, + "top_k_num": ctx.top_k_num, + "device": self.input.device, + "N": lora_rank, + "M": topk_weights.shape[0], + "EM": sorted_token_ids.shape[1], + "K": self.input.shape[1], + "num_tokens": num_tokens, + "num_experts": ctx.num_experts, + "num_slices": num_slices, + "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"], + "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"], + "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"], + "shrink_group_size_m": kernel_config["GROUP_SIZE_M"], + "shrink_num_warps": kernel_config["NUM_WARPS"], + "shrink_num_stages": kernel_config["NUM_STAGES"], + "shrink_split_k": kernel_config.get("SPLIT_K", 1), + "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(), + } + + def as_fused_moe_lora_expand_kwargs( + self, ctx: BenchmarkContext, op_type: OpType + ) -> dict[str, Any]: + self.sanity_check(ctx, op_type) + self.to_device(self.input.device) + + _, num_tokens, _, num_slices = self.metadata(ctx, op_type) + + # Sanity check matrix shapes. 
+ i_shape, lw_shape, o_shape = ( + self.input.shape, + self.lora_weights_lst[0].shape, + self.output.shape, + ) + + # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank] + assert len(i_shape) == 4 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[-1] + # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank] + assert len(lw_shape) == 4 + assert lw_shape[-1] == lora_rank + hidden_size = lw_shape[-2] + # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices] + assert len(o_shape) == 3 + assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices) + + kernel_config = get_lora_op_configs( + op_type.name.lower(), + max_loras=lw_shape[0], + batch=num_tokens, + hidden_size=hidden_size, + rank=lora_rank, + num_slices=num_slices, + add_inputs=False, + ) + + (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = ( + self.fused_moe_lora_data_prepare( + block_size=kernel_config["BLOCK_SIZE_M"], + token_lora_mapping=self.lora_kernel_meta.token_lora_mapping, + ctx=ctx, + ) + ) + + return { + "a_intermediate_cache1": self.input, + "lora_b_stacked": self.lora_weights_lst, + "output": self.output, + "topk_weights": topk_weights, + "sorted_token_ids": sorted_token_ids, + "expert_ids": expert_ids, + "num_tokens_post_padded": num_tokens_post_padded, + "top_k_num": ctx.top_k_num, + "device": self.input.device, + "N": lora_rank, + "M": topk_weights.shape[0], + "EM": sorted_token_ids.shape[1], + "K": self.input.shape[1], + "num_tokens": num_tokens, + "num_experts": ctx.num_experts, + "num_slices": num_slices, + "max_lora_rank": lora_rank, + "w1_output_dim_size": lw_shape[2], + "expand_block_size_m": kernel_config["BLOCK_SIZE_M"], + "expand_block_size_n": kernel_config["BLOCK_SIZE_N"], + "expand_block_size_k": kernel_config["BLOCK_SIZE_K"], + "expand_group_size_m": kernel_config["GROUP_SIZE_M"], + "expand_num_warps": kernel_config["NUM_WARPS"], + "expand_num_stages": kernel_config["NUM_STAGES"], + "expand_split_k": kernel_config.get("SPLIT_K", 1), + "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(), + } + + def bench_fn_kwargs( + self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None + ) -> dict[str, Any]: + if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn(): assert add_inputs is None else: assert add_inputs is not None if op_type == OpType.LORA_SHRINK: - return self.as_lora_shrink_kwargs() + return self.as_lora_shrink_kwargs(ctx, op_type) if op_type == OpType.LORA_EXPAND: - return self.as_lora_expand_kwargs(add_inputs) + return self.as_lora_expand_kwargs(ctx, op_type, add_inputs) + if op_type.is_fused_moe_lora_shrink_fn(): + return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type) + if op_type.is_fused_moe_lora_expand_fn(): + return self.as_fused_moe_lora_expand_kwargs(ctx, op_type) raise ValueError(f"Unrecognized optype {self}") def test_correctness( @@ -617,7 +996,7 @@ def bench_optype( test_correctness: bool = False, ) -> TMeasurement: assert arg_pool_size >= 1 - if op_type.is_shrink_fn(): + if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn(): assert expand_fn_add_inputs is None else: assert expand_fn_add_inputs is not None @@ -627,23 +1006,30 @@ def bench_optype( BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size) ] for bt in bench_tensors: - bt.sanity_check() + bt.sanity_check(ctx, op_type) # Test correctness of our implementation. 
if test_correctness: + assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], ( + f"Correctness testing is not supported for {op_type.name}." + ) assert all( - [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors] + [ + bt.test_correctness(ctx, op_type, expand_fn_add_inputs) + for bt in bench_tensors + ] ) # BenchmarkTensors -> dict (kwargs) kwargs_list = [ - bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs) for bt in bench_tensors ] # Clear LoRA optimization hash-maps. _LORA_A_PTR_DICT.clear() _LORA_B_PTR_DICT.clear() + _LORA_PTR_DICT.clear() # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) @@ -793,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]): # Benchmark bench_op expand_fn_add_inputs = ( - [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + [None] + if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn() + else args.expand_fn_add_inputs ) for add_input_arg in expand_fn_add_inputs: seq_len_timers.append( @@ -831,12 +1219,22 @@ def as_benchmark_contexts( hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace ) -> list[BenchmarkContext]: ctxs: list[BenchmarkContext] = [] - for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + for ( + batch_size, + hidden_size, + lora_rank, + num_loras, + sort_by_lora_id, + top_k_num, + num_experts, + ) in product( # noqa args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, args.sort_by_lora_id, + args.top_k_nums, + args.num_experts, ): ctxs.append( BenchmarkContext( @@ -851,6 +1249,8 @@ def as_benchmark_contexts( seq_length=None, sort_by_lora_id=sort_by_lora_id, dtype=args.dtype, + top_k_num=top_k_num, + num_experts=num_experts, # To be filled based on the OpType to benchmark num_slices=None, ) @@ -1012,6 +1412,22 @@ if __name__ == "__main__": ), ) + p.add_argument( + "--top-k-nums", + nargs="+", + type=int, + default=DEFAULT_TOP_K_NUMS, + help="Top-K values for MoE LoRA operations", + ) + + p.add_argument( + "--num-experts", + nargs="+", + type=int, + default=DEFAULT_NUM_EXPERTS, + help="Number of experts for MoE LoRA operations", + ) + parser = FlexibleArgumentParser( description=f""" Benchmark LoRA kernels: diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 318a0e58805d3..91ab4a87c65f8 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -158,6 +158,8 @@ def use_fused_moe_lora_kernel( "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, + "NUM_WARPS": 4, + "NUM_STAGES": 3, "SPLIT_K": 1, } @@ -182,6 +184,15 @@ def use_fused_moe_lora_kernel( config["BLOCK_SIZE_N"], config["BLOCK_SIZE_K"], config["GROUP_SIZE_M"], + config["NUM_WARPS"], + config["NUM_STAGES"], + config["SPLIT_K"], + config["BLOCK_SIZE_M"], + config["BLOCK_SIZE_N"], + config["BLOCK_SIZE_K"], + config["GROUP_SIZE_M"], + config["NUM_WARPS"], + config["NUM_STAGES"], config["SPLIT_K"], mul_routed_weight, ) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 7711f5c3208bc..f5a766dd5e45a 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -13,6 +13,7 @@ from vllm.distributed.parallel_state import ( get_tensor_model_parallel_world_size, ) from vllm.lora.layers.base import BaseLayerWithLoRA +from 
vllm.lora.ops.triton_ops.utils import get_lora_op_configs from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, @@ -39,6 +40,64 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.device = base_layer.w2_weight.device self._inject_lora_into_fused_moe() + def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]: + normalized_config = {} + for key, value in config.items(): + if key.islower(): + if key.startswith("block_"): + normalized_key = "BLOCK_SIZE_" + key.split("_")[-1].upper() + else: + normalized_key = key.upper() + else: + normalized_key = key + normalized_config[normalized_key] = value + return normalized_config + + def _get_lora_moe_configs( + self, + op_prefix: str, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + num_slices: int, + M: int, + layer: FusedMoE, + top_k: int, + config_dtype: str, + ): + if envs.VLLM_TUNED_CONFIG_FOLDER: + shrink_config = get_lora_op_configs( + op_type=f"fused_moe_lora_{op_prefix}_shrink", + max_loras=lora_a_stacked.shape[0], + batch=M, + hidden_size=lora_a_stacked.shape[-1], + rank=lora_a_stacked.shape[-2], + num_slices=num_slices, + moe_intermediate_size=lora_b_stacked.shape[-2], + ) + expand_config = get_lora_op_configs( + op_type=f"fused_moe_lora_{op_prefix}_expand", + max_loras=lora_a_stacked.shape[0], + batch=M, + hidden_size=lora_a_stacked.shape[-1], + rank=lora_a_stacked.shape[-2], + num_slices=num_slices, + moe_intermediate_size=lora_b_stacked.shape[-2], + ) + else: # fall back to the default config + get_config_func = functools.partial( + try_get_optimal_moe_config, + layer.w13_weight.size(), + layer.w2_weight.size(), + top_k, + config_dtype, + block_shape=layer.quant_method.moe_quant_config.block_shape, + ) + shrink_config = get_config_func(M) + expand_config = get_config_func(M) + shrink_config = self._normalize_keys(shrink_config) + expand_config = self._normalize_keys(expand_config) + return shrink_config, expand_config + def _inject_lora_into_fused_moe(self): moe_state_dict = {} top_k = self.base_layer.top_k @@ -90,17 +149,19 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens = hidden_states.size(0) M = min(num_tokens, CHUNK_SIZE) - get_config_func = functools.partial( - try_get_optimal_moe_config, - layer.w13_weight.size(), - layer.w2_weight.size(), - top_k, - config_dtype, - block_shape=layer.quant_method.moe_quant_config.block_shape, + shrink_config, expand_config = self._get_lora_moe_configs( + op_prefix="w13", + lora_a_stacked=self.w1_lora_a_stacked, + lora_b_stacked=self.w1_lora_b_stacked, + num_slices=2, + M=M, + layer=layer, + top_k=top_k, + config_dtype=config_dtype, ) + # get the block size of m from customized config or default config max_loras = self.w1_lora_a_stacked.shape[0] - config = get_config_func(M) ( sorted_token_ids_lora, expert_ids_lora, @@ -108,7 +169,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ) = self.punica_wrapper.moe_lora_align_block_size( curr_topk_ids, num_tokens, - config["BLOCK_SIZE_M"], + shrink_config["BLOCK_SIZE_M"], self.base_layer.local_num_experts, max_loras, self.adapter_enabled, @@ -138,7 +199,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens_post_padded_lora, max_lora_rank, top_k, - config, + shrink_config, ## pass the shrink config + expand_config, ## pass the expand config self.adapter_enabled, ) @@ -164,17 +226,17 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens = hidden_states.size(0) M = min(num_tokens, CHUNK_SIZE) - get_config_func = 
functools.partial( - try_get_optimal_moe_config, - layer.w13_weight.size(), - layer.w2_weight.size(), - top_k, - config_dtype, - block_shape=layer.quant_method.moe_quant_config.block_shape, + shrink_config, expand_config = self._get_lora_moe_configs( + op_prefix="w2", + lora_a_stacked=self.w2_lora_a_stacked, + lora_b_stacked=self.w2_lora_b_stacked, + num_slices=1, + M=M, + layer=layer, + top_k=top_k, + config_dtype=config_dtype, ) - config = get_config_func(M) - sorted_token_ids_lora = moe_state_dict["sorted_token_ids_lora"] expert_ids_lora = moe_state_dict["expert_ids_lora"] num_tokens_post_padded_lora = moe_state_dict[ @@ -197,7 +259,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens_post_padded_lora, max_lora_rank, top_k, - config, + shrink_config, ## pass the shrink config + expand_config, ## pass the expand config self.adapter_enabled, True, ) diff --git a/vllm/lora/ops/triton_ops/README_TUNING.md b/vllm/lora/ops/triton_ops/README_TUNING.md index fda95ea71891f..d576e261557a4 100644 --- a/vllm/lora/ops/triton_ops/README_TUNING.md +++ b/vllm/lora/ops/triton_ops/README_TUNING.md @@ -44,8 +44,17 @@ For `shrink`, the config file is named as `{gpu_name}_SHRINK.json`, e.g. `NVIDIA For `expand`, the config fileis named as `{gpu_name}_EXPAND_{add_input}.json`, e.g. `NVIDIA_H200_EXPAND_TRUE.json`. +For `fused_moe_lora_w13_shrink`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W13_SHRINK.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W13_SHRINK.json`. + +For `fused_moe_lora_w13_expand`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W13_EXPAND.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W13_EXPAND.json`. + +For `fused_moe_lora_w2_shrink`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W2_SHRINK.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W2_SHRINK.json`. + +For `fused_moe_lora_w2_expand`, the config file is named as `{gpu_name}_FUSED_MOE_LORA_W2_EXPAND.json`, e.g. `NVIDIA_H200_FUSED_MOE_LORA_W2_EXPAND.json`. + The `gpu_name` can be automatically detected by calling `torch.cuda.get_device_name()` ### Json Structure -Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n]` +Optimal kernel configuration files are saved as JSON files with the structure `config_data[max_loras][num_slices][m][k][n][i]` +where `i` is an optional dimension in the `fused_moe_lora` configuration, representing the intermediate size of the MoE layer. 
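For illustration, here is a minimal Python sketch of how such a nested config could be consulted, mirroring the nearest-key fallback that `get_lora_op_configs` applies in `vllm/lora/ops/triton_ops/utils.py` for the `m`/`k`/`n` (and, for `fused_moe_lora`, `i`) dimensions. The `lookup_tuned_config` helper, its signature, and the example leaf keys are hypothetical and only illustrate the documented layout; they are not part of this patch.

```python
# Sketch only: walk config_data[max_loras][num_slices][m][k][n][i], falling back
# to the numerically closest tuned entry when an exact key is missing.
# The helper name and the example values below are made up for illustration.
def lookup_tuned_config(
    config_data: dict,
    max_loras: int,
    num_slices: int,
    m: int,
    k: int,
    n: int,
    moe_intermediate_size: int | None = None,
) -> dict:
    def nearest(d: dict, key: int) -> dict:
        # Exact match if present, otherwise the closest tuned bucket.
        return d.get(str(key)) or d[min(d.keys(), key=lambda x: abs(int(x) - key))]

    cfg = config_data[str(max_loras)][str(num_slices)]
    for dim in (m, k, n):
        cfg = nearest(cfg, dim)
    if moe_intermediate_size is not None:
        # Optional extra level used only by the fused_moe_lora op configs.
        cfg = nearest(cfg, moe_intermediate_size)
    return cfg  # e.g. {"block_m": 64, "block_n": 64, "block_k": 32, ...}
```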
diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py index 436ea4ed00c82..7e8b9a79add39 100644 --- a/vllm/lora/ops/triton_ops/__init__.py +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -1,7 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.lora.ops.triton_ops.fused_moe_lora_op import fused_moe_lora + +from vllm.lora.ops.triton_ops.fused_moe_lora_op import ( + fused_moe_lora, + fused_moe_lora_expand, + fused_moe_lora_shrink, +) from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta from vllm.lora.ops.triton_ops.lora_shrink_op import lora_shrink @@ -11,4 +16,6 @@ __all__ = [ "lora_shrink", "LoRAKernelMeta", "fused_moe_lora", + "fused_moe_lora_shrink", + "fused_moe_lora_expand", ] diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index 539605c7c534a..8f85f926aa4f1 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -176,88 +176,50 @@ def _fused_moe_lora_kernel( @torch.inference_mode() -def _fused_moe_lora( - output: torch.Tensor, # (num_tokens, top_k_num, N*len(lora_a_stacked),) +def _fused_moe_lora_shrink( + a_intermediate_cache1: torch.Tensor, + # (num_slices, num_tokens, top_k_num, max_lora_rank) qcurr_hidden_states: torch.Tensor, # (num_tokens, K,) lora_a_stacked: list[ torch.Tensor ], # [(max_loras, num_experts, max_lora_rank, K,),...] - lora_b_stacked: list[ - torch.Tensor - ], # [(max_loras, num_experts, N, max_lora_rank,),...] topk_weights: torch.Tensor, # (num_tokens, top_k_num) sorted_token_ids: torch.Tensor, # (max_loras, _) expert_ids: torch.Tensor, # (max_loras, _ ,) num_tokens_post_padded: torch.Tensor, # (max_loras, ) - max_lora_rank: int, top_k_num: int, lora_ids: torch.Tensor, adapter_enabled: torch.Tensor, + ## adding for kernel + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, block_size_m: int, block_size_n: int, block_size_k: int, group_size_m: int, + num_warps: int, + num_stages: int, split_k: int, mul_routed_weight: bool = False, ) -> None: - assert len(lora_a_stacked) == len(lora_b_stacked) > 0 - assert ( - sorted_token_ids.dim() - == expert_ids.dim() - == topk_weights.dim() - == qcurr_hidden_states.dim() - == 2 - ) - assert ( - sorted_token_ids.shape[0] - == expert_ids.shape[0] - == num_tokens_post_padded.shape[0] - ) - assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1] - assert output.shape[0] == topk_weights.shape[0] - assert top_k_num == topk_weights.shape[1] + w1_lora_a_stacked = lora_a_stacked[0] - for lora_a, lora_b in zip(lora_a_stacked, lora_b_stacked): - assert lora_a.dtype == lora_b.dtype == output.dtype == qcurr_hidden_states.dtype - assert lora_a.dtype in [torch.float16, torch.bfloat16] - - device = qcurr_hidden_states.device - num_slices = len(lora_a_stacked) - - config = { + shrink_config = { "BLOCK_SIZE_M": block_size_m, "BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k, "GROUP_SIZE_M": group_size_m, + "num_warps": num_warps, + "num_stages": num_stages, "SPLIT_K": split_k, } - w1_lora_a_stacked = lora_a_stacked[0] - w1_lora_b_stacked = lora_b_stacked[0] - num_experts = lora_a_stacked[0].shape[1] - - N = max_lora_rank - M = topk_weights.shape[0] - EM = sorted_token_ids.shape[1] - K = qcurr_hidden_states.shape[1] - num_tokens = M * top_k_num 
- w1_output_dim_size = w1_lora_b_stacked.shape[2] - - lora_intermediate_cache1 = torch.zeros( - (num_slices * M * top_k_num * (max_lora_rank + w1_output_dim_size)), - dtype=output.dtype, - device=device, - ) - - # slices - a_intermediate_size = num_slices * M * top_k_num * max_lora_rank - a_intermediate_cache1 = lora_intermediate_cache1[:a_intermediate_size].view( - num_slices, M, top_k_num, max_lora_rank - ) - b_intermediate_cache1 = lora_intermediate_cache1[a_intermediate_size:].view( - num_slices, M, top_k_num, w1_output_dim_size - ) - b_ptr = _get_ptr(lora_a_stacked, device) grid = lambda META: ( @@ -299,19 +261,70 @@ def _fused_moe_lora( num_slice_c=num_slices, top_k=1 if mul_routed_weight else top_k_num, MUL_ROUTED_WEIGHT=False, - **config, + **shrink_config, ) + +@torch.inference_mode() +def _fused_moe_lora_expand( + output: torch.Tensor, # (num_tokens, top_k_num, N*len(lora_a_stacked),) + a_intermediate_cache1: torch.Tensor, # (num_slices, M, top_k_num, max_lora_rank) + lora_b_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, max_lora_rank, K,),...] + topk_weights: torch.Tensor, # (num_tokens, top_k_num) + sorted_token_ids: torch.Tensor, # (max_loras, _) + expert_ids: torch.Tensor, # (max_loras, _ ,) + num_tokens_post_padded: torch.Tensor, # (max_loras, ) + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + ## adding for kernel + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, + max_lora_rank: int, + w1_output_dim_size: int, + block_size_m: int, + block_size_n: int, + block_size_k: int, + group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, + mul_routed_weight: bool = False, +) -> None: b_ptr = _get_ptr(lora_b_stacked, device) K = max_lora_rank N = w1_output_dim_size + w1_lora_b_stacked = lora_b_stacked[0] + a_intermediate_cache1 = a_intermediate_cache1.view( -1, a_intermediate_cache1.shape[3] ) - # Set split_k = 1 for expand calls - config["SPLIT_K"] = 1 + b_intermediate_cache1 = torch.zeros( + (num_slices, M, top_k_num, w1_output_dim_size), + dtype=output.dtype, + device=device, + ) + + expand_config = { + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + "num_warps": num_warps, + "num_stages": num_stages, + "SPLIT_K": split_k, # Set split_k = 1 for expand calls + } + grid = lambda META: ( triton.cdiv(EM, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]), len(lora_b_stacked), @@ -348,12 +361,142 @@ def _fused_moe_lora( num_slice_c=num_slices, top_k=1, MUL_ROUTED_WEIGHT=mul_routed_weight, - **config, + **expand_config, ) for i in range(num_slices): output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i] +@torch.inference_mode() +def _fused_moe_lora( + output: torch.Tensor, # (num_tokens, top_k_num, N*len(lora_a_stacked),) + qcurr_hidden_states: torch.Tensor, # (num_tokens, K,) + lora_a_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, max_lora_rank, K,),...] + lora_b_stacked: list[ + torch.Tensor + ], # [(max_loras, num_experts, N, max_lora_rank,),...] 
+ topk_weights: torch.Tensor, # (num_tokens, top_k_num) + sorted_token_ids: torch.Tensor, # (max_loras, _) + expert_ids: torch.Tensor, # (max_loras, _ ,) + num_tokens_post_padded: torch.Tensor, # (max_loras, ) + max_lora_rank: int, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + shrink_block_size_m: int, + shrink_block_size_n: int, + shrink_block_size_k: int, + shrink_group_size_m: int, + shrink_num_warps: int, + shrink_num_stages: int, + shrink_split_k: int, + expand_block_size_m: int, + expand_block_size_n: int, + expand_block_size_k: int, + expand_group_size_m: int, + expand_num_warps: int, + expand_num_stages: int, + expand_split_k: int, + mul_routed_weight: bool = False, +) -> None: + assert len(lora_a_stacked) == len(lora_b_stacked) > 0 + assert ( + sorted_token_ids.dim() + == expert_ids.dim() + == topk_weights.dim() + == qcurr_hidden_states.dim() + == 2 + ) + assert ( + sorted_token_ids.shape[0] + == expert_ids.shape[0] + == num_tokens_post_padded.shape[0] + ) + assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1] + assert output.shape[0] == topk_weights.shape[0] + assert top_k_num == topk_weights.shape[1] + device = qcurr_hidden_states.device + num_slices = len(lora_a_stacked) + w1_lora_b_stacked = lora_b_stacked[0] + num_experts = lora_a_stacked[0].shape[1] + N = max_lora_rank + M = topk_weights.shape[0] + EM = sorted_token_ids.shape[1] + K = qcurr_hidden_states.shape[1] + num_tokens = M * top_k_num + w1_output_dim_size = w1_lora_b_stacked.shape[2] + + a_intermediate_cache1 = torch.zeros( + (num_slices, M, top_k_num, max_lora_rank), + dtype=output.dtype, + device=device, + ) + + _fused_moe_lora_shrink( + a_intermediate_cache1, + qcurr_hidden_states, + lora_a_stacked, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + top_k_num, + lora_ids, + adapter_enabled, + ## adding for kernel + device, + N, + M, + EM, + K, + num_tokens, + num_experts, + num_slices, + shrink_block_size_m, + shrink_block_size_n, + shrink_block_size_k, + shrink_group_size_m, + shrink_num_warps, + shrink_num_stages, + shrink_split_k, + mul_routed_weight, + ) + + _fused_moe_lora_expand( + output, + a_intermediate_cache1, + lora_b_stacked, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + top_k_num, + lora_ids, + adapter_enabled, + ## adding for kernel + device, + N, + M, + EM, + K, + num_tokens, + num_experts, + num_slices, + max_lora_rank, + w1_output_dim_size, + expand_block_size_m, + expand_block_size_n, + expand_block_size_k, + expand_group_size_m, + expand_num_warps, + expand_num_stages, + expand_split_k, + mul_routed_weight, + ) + + def _fused_moe_lora_fake( output: torch.Tensor, qcurr_hidden_states: torch.Tensor, @@ -367,10 +510,84 @@ def _fused_moe_lora_fake( top_k_num: int, lora_ids: torch.Tensor, adapter_enabled: torch.Tensor, + shrink_block_size_m: int, + shrink_block_size_n: int, + shrink_block_size_k: int, + shrink_group_size_m: int, + shrink_num_warps: int, + shrink_num_stages: int, + shrink_split_k: int, + expand_block_size_m: int, + expand_block_size_n: int, + expand_block_size_k: int, + expand_group_size_m: int, + expand_num_warps: int, + expand_num_stages: int, + expand_split_k: int, + mul_routed_weight: bool = False, +) -> None: + return + + +def _fused_moe_lora_shrink_fake( + a_intermediate_cache1: torch.Tensor, + qcurr_hidden_states: torch.Tensor, + lora_a_stacked: list[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + 
num_tokens_post_padded: torch.Tensor, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, block_size_m: int, block_size_n: int, block_size_k: int, group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, + mul_routed_weight: bool = False, +) -> None: + return + + +def _fused_moe_lora_expand_fake( + output: torch.Tensor, + a_intermediate_cache1: torch.Tensor, + lora_b_stacked: list[torch.Tensor], + topk_weights: torch.Tensor, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + top_k_num: int, + lora_ids: torch.Tensor, + adapter_enabled: torch.Tensor, + device: torch.device, + N: int, + M: int, + EM: int, + K: int, + num_tokens: int, + num_experts: int, + num_slices: int, + max_lora_rank: int, + w1_output_dim_size: int, + block_size_m: int, + block_size_n: int, + block_size_k: int, + group_size_m: int, + num_warps: int, + num_stages: int, + split_k: int, mul_routed_weight: bool = False, ) -> None: return @@ -383,7 +600,26 @@ try: mutates_args=["output"], fake_impl=_fused_moe_lora_fake, ) + + direct_register_custom_op( + op_name="fused_moe_lora_shrink", + op_func=_fused_moe_lora_shrink, + mutates_args=["a_intermediate_cache1"], + fake_impl=_fused_moe_lora_shrink_fake, + ) + + direct_register_custom_op( + op_name="fused_moe_lora_expand", + op_func=_fused_moe_lora_expand, + mutates_args=["output"], + fake_impl=_fused_moe_lora_expand_fake, + ) + fused_moe_lora = torch.ops.vllm.fused_moe_lora + fused_moe_lora_shrink = torch.ops.vllm.fused_moe_lora_shrink + fused_moe_lora_expand = torch.ops.vllm.fused_moe_lora_expand except AttributeError: fused_moe_lora = _fused_moe_lora + fused_moe_lora_shrink = _fused_moe_lora_shrink + fused_moe_lora_expand = _fused_moe_lora_expand diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 368c5037d2e4d..bd413a6db26b8 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -154,13 +154,13 @@ def load_lora_op_config(op_type: str, add_inputs: bool | None) -> dict | None: gpu_name = gpu_name.replace("-", "_") config_fname = None - if op_type == "shrink": - config_fname = f"{gpu_name}_{op_type.upper()}.json" - else: - assert op_type == "expand" + # only expand op needs to consider add_inputs + if op_type == "expand": config_fname = ( f"{gpu_name}_{op_type.upper()}_{str(add_inputs).upper()}.json" ) + else: + config_fname = f"{gpu_name}_{op_type.upper()}.json" config_path = Path(f"{user_defined_config_folder}/{config_fname}") if not config_path.exists(): @@ -186,8 +186,17 @@ def get_lora_op_configs( rank: int, num_slices: int, add_inputs: bool | None = None, + moe_intermediate_size: int | None = None, ) -> dict[str, int | None]: - assert op_type in ["shrink", "expand"] + # Add support for fused_moe_lora ops + assert op_type in [ + "shrink", + "expand", + "fused_moe_lora_w13_shrink", + "fused_moe_lora_w13_expand", + "fused_moe_lora_w2_shrink", + "fused_moe_lora_w2_expand", + ] # default config default = {} @@ -203,6 +212,22 @@ def get_lora_op_configs( "num_stages": 2, "max_nreg": None, } + # The default config for fused_moe_lora ops + elif op_type in [ + "fused_moe_lora_w13_shrink", + "fused_moe_lora_w13_expand", + "fused_moe_lora_w2_shrink", + "fused_moe_lora_w2_expand", + ]: + default = { + "block_m": 64, + "block_n": 64, + "block_k": 32, + "num_warps": 4, + "num_stages": 3, + "group_size_m": 
8, + "split_k": 1, + } else: default = { "block_m": 64, @@ -247,5 +272,13 @@ def get_lora_op_configs( or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - n))] ) + # slice by moe-intermediate-size if applicable + if moe_intermediate_size is not None: + i = moe_intermediate_size + config_data = ( + config_data.get(str(i)) + or config_data[min(config_data.keys(), key=lambda x: abs(int(x) - i))] + ) + assert config_data is not None return config_data diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index c552412cfd62e..b6186e8561529 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -479,7 +479,8 @@ class PunicaWrapperBase(PunicaWrapperABC): num_tokens_post_padded: torch.Tensor, max_lora_rank: int, top_k_num: int, - config, + shrink_config, + expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, ): diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 30def90380db1..1bb80e516d3f8 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -367,7 +367,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): num_tokens_post_padded: torch.Tensor, max_lora_rank: int, top_k_num: int, - config, + shrink_config, + expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, ): @@ -388,10 +389,19 @@ class PunicaWrapperGPU(PunicaWrapperBase): top_k_num, lora_ids, adapter_enabled, - config["BLOCK_SIZE_M"], - config["BLOCK_SIZE_N"], - config["BLOCK_SIZE_K"], - config["GROUP_SIZE_M"], - config.get("SPLIT_K", 1), + shrink_config.get("BLOCK_SIZE_M", 64), + shrink_config.get("BLOCK_SIZE_N", 64), + shrink_config.get("BLOCK_SIZE_K", 32), + shrink_config.get("GROUP_SIZE_M", 8), + shrink_config.get("NUM_WARPS", 4), + shrink_config.get("NUM_STAGES", 3), + shrink_config.get("SPLIT_K", 1), + expand_config.get("BLOCK_SIZE_M", 64), + expand_config.get("BLOCK_SIZE_N", 64), + expand_config.get("BLOCK_SIZE_K", 32), + expand_config.get("GROUP_SIZE_M", 8), + expand_config.get("NUM_WARPS", 4), + expand_config.get("NUM_STAGES", 3), + expand_config.get("SPLIT_K", 1), mul_routed_weight, ) From 03c4c4aa9deb2ad09a95c7997d2e5578c8db68d6 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 4 Nov 2025 03:00:57 -0800 Subject: [PATCH 118/976] Support using Int4PreshuffledTensor after loading (#26066) Signed-off-by: Jerry Zhang --- tests/quantization/test_torchao.py | 146 +++++++++++++++++- .../layers/quantization/torchao.py | 66 +++++++- 2 files changed, 208 insertions(+), 4 deletions(-) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index cab198a2a15e2..82413f36e997f 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -99,7 +99,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner): @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") -def test_on_the_fly_quant_config_dict_json(vllm_runner): +def test_online_quant_config_dict_json(vllm_runner): """Testing on the fly quantization, load_weights integration point, with config dict serialized to json string """ @@ -133,7 +133,7 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner): @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") -def test_on_the_fly_quant_config_file(vllm_runner): +def test_online_quant_config_file(vllm_runner): """Testing on the fly quantization, load_weights integration point, with config file """ @@ 
-252,6 +252,148 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner): ) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=4) + assert output + + +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +@pytest.mark.skip( + reason="since torchao nightly is only compatible with torch nightly" + "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip " + "torchao tests that requires newer versions (0.14.0.dev+) for now" +) +def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypatch): + """We load a model with Int4Tensor (plain format) linear weights + and verify that the weight is updated to Int4PreshuffledTensor + after loading in vllm + """ + from torchao.quantization import Int4PreshuffledTensor + from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90 + + torch._dynamo.reset() + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + model_name = "torchao-testing/opt-125m-Int4WeightOnlyConfig-v2-0.14.0.dev" + # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't + # have meta kernel implemented yet, can remove this flag after that is implemented + with vllm_runner( + model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location="cuda:0", + enforce_eager=True, + ) as llm: + + def has_int4_preshuffled_tensor_weight(model): + return isinstance( + model.model.decoder.layers[0].self_attn.qkv_proj.weight, + Int4PreshuffledTensor, + ) + + def get_weight_attrs(model): + weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight + return [ + weight.requires_grad, + weight.input_dim, + weight.output_dim, + hasattr(weight, "weight_loader"), + ] + + llm_engine = llm.get_llm().llm_engine + has_int4_preshuffled_tensor = any( + llm_engine.apply_model(has_int4_preshuffled_tensor_weight) + ) + weight_attrs = llm_engine.apply_model(get_weight_attrs)[0] + + # making sure we are using Int4PreshuffledTensor on H100 GPU, when + # fbgemm_gpu_genai + # library is installed, otherwise it should be using Int4Tensor + if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90(): + assert has_int4_preshuffled_tensor + else: + assert not has_int4_preshuffled_tensor + + assert weight_attrs == [False, 1, 0, True] + output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + + assert output + + +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +@pytest.mark.skip( + reason="since torchao nightly is only compatible with torch nightly" + "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip " + "torchao tests that requires newer versions (0.14.0.dev+) for now" +) +def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant( + vllm_runner, monkeypatch +): + """We load a bf16 model and online quantize the model to int4, then verify that + the weights are updated to Int4PreshuffledTensor after online quantization + """ + from torchao.quantization import Int4PreshuffledTensor + from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90 + + torch._dynamo.reset() + model_name = "facebook/opt-125m" + + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + import json + + from torchao.core.config import config_to_dict + from torchao.quantization import Int4WeightOnlyConfig + + torchao_quant_config = Int4WeightOnlyConfig( + group_size=128, int4_packing_format="plain" + ) + hf_overrides = { + "quantization_config_dict_json": json.dumps( + 
config_to_dict(torchao_quant_config) + ) + } + + # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't + # have meta kernel implemented yet, can remove this flag after that is implemented + with vllm_runner( + model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location="cuda:0", + hf_overrides=hf_overrides, + enforce_eager=True, + ) as llm: + + def has_int4_preshuffled_tensor_weight(model): + return isinstance( + model.model.decoder.layers[0].self_attn.qkv_proj.weight, + Int4PreshuffledTensor, + ) + + def get_weight_attrs(model): + weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight + return [ + weight.requires_grad, + weight.input_dim, + weight.output_dim, + hasattr(weight, "weight_loader"), + ] + + llm_engine = llm.get_llm().llm_engine + has_int4_preshuffled_tensor = any( + llm_engine.apply_model(has_int4_preshuffled_tensor_weight) + ) + weight_attrs = llm_engine.apply_model(get_weight_attrs)[0] + + # making sure we are using Int4PreshuffledTensor on H100 GPU, when + # fbgemm_gpu_genai + # library is installed, otherwise it should be using Int4Tensor + if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90(): + assert has_int4_preshuffled_tensor + else: + assert not has_int4_preshuffled_tensor + + assert weight_attrs == [False, 1, 0, True] + output = llm.generate_greedy(["The capital of France is"], max_tokens=32) + assert output diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index f42c45dae76d2..3fee71e193db5 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib import json +import types from importlib.util import find_spec from typing import Any, Optional @@ -27,6 +28,39 @@ from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) +def _bond_method_to_cls(func, obj): + if hasattr(func, "__self__") or not callable(func): + # If the function is already bound to an instance, return it as is + return func + else: + return types.MethodType(func, obj) + + +def _get_weight_attrs(param): + # record attributes attached to the weight, so we can + # recover later + recorded_weight_attr = {} + for key in param.__dict__: + if hasattr(param, key): + attr = getattr(param, key) + if not callable(attr): + recorded_weight_attr[key] = attr + elif hasattr(attr, "__self__") and param is attr.__self__: + # if attr is a bonded method for an instance, and + # attr.__self__ points to the instance (param) + # we'll record the underlying function object + recorded_weight_attr[key] = attr.__func__ + else: + recorded_weight_attr[key] = attr + return recorded_weight_attr + + +def _restore_weight_attrs(param, recorded_weight_attr): + for attr_name, attr in recorded_weight_attr.items(): + if not hasattr(param, attr_name): + setattr(param, attr_name, _bond_method_to_cls(attr, param)) + + def torchao_version_at_least(torchao_version: str) -> bool: if find_spec("torchao"): try: @@ -57,6 +91,14 @@ def should_skip(prefix: str, skip_modules: list[str]) -> bool: return False +if torchao_version_at_least("0.15.0"): + from torchao.prototype.tensor_conversion.api import ( + convert_to_packed_tensor_based_on_current_hardware, + ) +else: + convert_to_packed_tensor_based_on_current_hardware = lambda t: t + + class TorchAOConfig(QuantizationConfig): """Config class for torchao.""" @@ -307,12 +349,32 
@@ class TorchAOLinearMethod(LinearMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if self.quant_config.is_checkpoint_torchao_serialized: + if not hasattr(layer, "weight"): + return + + # record attributes attached to the weight, so we can + # recover later + recorded_weight_attr = _get_weight_attrs(layer.weight) + + layer.weight = Parameter( + convert_to_packed_tensor_based_on_current_hardware(layer.weight), + requires_grad=layer.weight.requires_grad, + ) + + _restore_weight_attrs(layer.weight, recorded_weight_attr) return - # quantize the weight on the fly if the checkpoint is not already + # online quantize the weight if the checkpoint is not already # quantized by torchao + recorded_weight_attr = _get_weight_attrs(layer.weight) + weight = torchao_quantize_param_data( layer.weight, self.quant_config.torchao_config ) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + weight = torch.nn.Parameter( + convert_to_packed_tensor_based_on_current_hardware(weight), + weight.requires_grad, + ) + + _restore_weight_attrs(weight, recorded_weight_attr) layer.register_parameter("weight", weight) From 300a2659785fb925f347637d5639d74cc2c5a9f5 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Tue, 4 Nov 2025 04:13:35 -0800 Subject: [PATCH 119/976] [Core] Enable StatLogger in LLMEngine (#28020) Signed-off-by: Zhuohan Li --- vllm/v1/engine/llm_engine.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f44b6b2070d9f..995642a8356fc 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -58,11 +58,6 @@ class LLMEngine: use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - if stat_loggers is not None: - raise NotImplementedError( - "Passing StatLoggers to LLMEngine is not yet supported." 
- ) - self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config self.model_config = vllm_config.model_config From 77f8001f533021ece46779f5b7e69edc1d3b514f Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:28:36 +0200 Subject: [PATCH 120/976] [Model][Bugfix] fix pipeline parallelism support for NemotronH (#27968) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- vllm/model_executor/models/nemotron_h.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 457d3910d0e57..324b63c1732fe 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -20,6 +20,7 @@ import typing from collections.abc import Callable, Iterable +from itertools import islice import torch from torch import nn @@ -549,7 +550,7 @@ class NemotronHModel(nn.Module): self.start_layer, self.end_layer, self.layers = make_layers( len(config.hybrid_override_pattern), get_layer, prefix=f"{prefix}.layers" ) - self.make_empty_intmd_tensors = make_empty_intermediate_tensors_factory( + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size ) @@ -564,7 +565,7 @@ class NemotronHModel(nn.Module): positions: torch.Tensor, intermediate_tensors: IntermediateTensors | None = None, inputs_embeds: torch.Tensor | None = None, - ) -> torch.Tensor: + ) -> torch.Tensor | IntermediateTensors: if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds @@ -576,8 +577,7 @@ class NemotronHModel(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - residual = None - for i, layer in enumerate(self.layers): + for layer in islice(self.layers, self.start_layer, self.end_layer): hidden_states, residual = layer( positions=positions, hidden_states=hidden_states, @@ -633,6 +633,9 @@ class NemotronHModel(nn.Module): if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -678,6 +681,9 @@ class NemotronHModel(nn.Module): if is_expert_weight: continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader @@ -792,7 +798,9 @@ class NemotronHForCausalLM( self.unpadded_vocab_size, config.vocab_size ) - self.make_empty_intmd_tensors = self.model.make_empty_intmd_tensors + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) # Set MoE hyperparameters if self.model.has_moe: From e4ee6586721cd9e09ac50207cb5e754d7a4a773e Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:59:43 +0200 Subject: [PATCH 121/976] [Model] add optimal triton fused moe configs for NemotronH MoE (#27967) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> --- benchmarks/kernels/benchmark_moe.py | 1 + ...856,device_name=NVIDIA_H100_80GB_HBM3.json | 147 ++++++++++++++++++ .../E=128,N=1856,device_name=NVIDIA_L40S.json | 147 ++++++++++++++++++ ...928,device_name=NVIDIA_H100_80GB_HBM3.json | 147 ++++++++++++++++++ .../E=128,N=928,device_name=NVIDIA_L40S.json | 147 
++++++++++++++++++ 5 files changed, 589 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index bc6cf83bc21fd..33c83574467cc 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -590,6 +590,7 @@ def main(args: argparse.Namespace): "DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM", "Glm4MoeForCausalLM", + "NemotronHForCausalLM", ): E = config.n_routed_experts topk = config.num_experts_per_tok diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..ee8a28b833d5a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, 
+ "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000000000..09d3fa584edd8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1856,device_name=NVIDIA_L40S.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..fc6454ebfb2fe --- /dev/null +++ 
b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "3072": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json new file mode 100644 index 0000000000000..48997646d99b6 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=928,device_name=NVIDIA_L40S.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + } +} From 938772af03ce01590c7e92b0d3fd0a5bdc899d19 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Tue, 4 Nov 2025 08:59:45 -0500 Subject: [PATCH 122/976] [Kernels] Isolate modular kernel code from FusedMoEMethodBase subclasses. 
(#27123) --- .../base_device_communicator.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 261 +++++++++++++----- .../layers/fused_moe/modular_kernel.py | 6 + .../layers/quantization/awq_marlin.py | 2 - .../layers/quantization/bitsandbytes.py | 3 +- .../compressed_tensors_moe.py | 47 ---- .../layers/quantization/experts_int8.py | 2 - .../model_executor/layers/quantization/fp8.py | 35 +-- .../layers/quantization/gguf.py | 2 - .../layers/quantization/gptq_marlin.py | 2 - .../layers/quantization/modelopt.py | 50 +--- .../layers/quantization/moe_wna16.py | 2 - .../layers/quantization/mxfp4.py | 105 +------ .../layers/quantization/quark/quark_moe.py | 53 ++-- .../model_executor/layers/quantization/rtn.py | 2 - .../model_executor/warmup/deep_gemm_warmup.py | 6 +- 16 files changed, 271 insertions(+), 311 deletions(-) diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 9566dbac7f22f..3a849da70e4cb 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -266,14 +266,14 @@ class DeviceCommunicatorBase: module for module in model.modules() # TODO(bnell): Should use isinstance but can't. Maybe search for - # presence of quant_method.init_prepare_finalize? + # presence of quant_method.maybe_init_modular_kernel? if ( module.__class__.__name__ == "FusedMoE" or module.__class__.__name__ == "SharedFusedMoE" ) ] for module in moe_modules: - module.quant_method.init_prepare_finalize(module) + module.maybe_init_modular_kernel() def dispatch( self, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 55aa2593193ab..118d5fa6b45c4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -117,10 +117,8 @@ class FusedMoeWeightScaleSupported(Enum): class FusedMoEMethodBase(QuantizeMethodBase): def __init__(self, moe: FusedMoEConfig): super().__init__() - self.moe = moe + self.moe: FusedMoEConfig = moe self.moe_quant_config: FusedMoEQuantConfig | None = None - self.fused_experts: FusedMoEModularKernel | None = None - self.topk_indices_dtype = None @abstractmethod def create_weights( @@ -245,9 +243,9 @@ class FusedMoEMethodBase(QuantizeMethodBase): else: return None - # Note: init_prepare_finalize should only be called by - # prepare_communication_buffer_for_model. - def init_prepare_finalize(self, layer: torch.nn.Module): + def maybe_init_modular_kernel( + self, layer: torch.nn.Module + ) -> FusedMoEModularKernel | None: assert self.moe is not None # We must get the quant config here so that the layer is @@ -261,17 +259,14 @@ class FusedMoEMethodBase(QuantizeMethodBase): logger.debug( "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) ) - assert self.topk_indices_dtype is None - assert self.fused_experts is None, ( - f"Attempt to override experts for {id(self)}!" 
- ) - self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() experts = self.select_gemm_impl(prepare_finalize, layer) - self.fused_experts = FusedMoEModularKernel( + return FusedMoEModularKernel( prepare_finalize, experts, layer.shared_experts, ) + else: + return None def select_gemm_impl( self, @@ -292,8 +287,16 @@ class FusedMoEMethodBase(QuantizeMethodBase): raise NotImplementedError @property - def using_modular_kernel(self) -> bool: - return self.fused_experts is not None + def topk_indices_dtype(self) -> torch.dtype | None: + return None + + @property + def supports_eplb(self) -> bool: + return False + + @property + def allow_inplace(self) -> bool: + return False @abstractmethod def apply( @@ -322,6 +325,138 @@ class FusedMoEMethodBase(QuantizeMethodBase): raise NotImplementedError +@CustomOp.register("modular_fused_moe") +class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): + def __init__( + self, + old_quant_method: FusedMoEMethodBase, + fused_experts: FusedMoEModularKernel, + ): + super().__init__(old_quant_method.moe) + # Find better way to copy attributes? Should we even copy attributes? + # self.__dict__.update(old_quant_method.__dict__) + self.moe_quant_config = old_quant_method.moe_quant_config + self.fused_experts = fused_experts + self.disable_expert_map = getattr( + old_quant_method, + "disable_expert_map", + not fused_experts.supports_expert_map(), + ) + self.old_quant_method = old_quant_method + logger.debug("Swapping out %s", self.old_quant_method.__class__.__name__) + + @property + def topk_indices_dtype(self) -> torch.dtype | None: + return self.fused_experts.prepare_finalize.topk_indices_dtype() + + @property + def supports_eplb(self) -> bool: + return self.old_quant_method.supports_eplb + + @property + def allow_inplace(self) -> bool: + return self.old_quant_method.allow_inplace + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + raise NotImplementedError + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + return self.moe_quant_config + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: int | None = None, + num_expert_group: int | None = None, + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + custom_routing_function: Callable | None = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: torch.Tensor | None = None, + logical_to_physical_map: torch.Tensor | None = None, + logical_replica_count: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + # Is getattr needed? + zero_expert_num = getattr(layer, "zero_expert_num", 0) + zero_expert_type = getattr(layer, "zero_expert_type", None) + + if enable_eplb: + if self.supports_eplb: + assert expert_load_view is not None + assert logical_to_physical_map is not None + assert logical_replica_count is not None + assert isinstance(layer, FusedMoE) + else: + raise NotImplementedError( + "EPLB is not supported for " + f"{self.old_quant_method.__class__.__name__}." 
+ ) + + topk_weights, topk_ids, zero_expert_result = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + global_num_experts=global_num_experts, + zero_expert_num=zero_expert_num, + zero_expert_type=zero_expert_type, + ) + + result = self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=self.allow_inplace, + activation=activation, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=None if self.disable_expert_map else expert_map, + ) + + if zero_expert_num != 0 and zero_expert_type is not None: + assert not isinstance(result, tuple), ( + "Shared + zero experts are mutually exclusive not yet supported" + ) + return result, zero_expert_result + else: + return result + + @CustomOp.register("unquantized_fused_moe") class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" @@ -378,6 +513,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ) self.flashinfer_cutlass_moe = None # type: ignore + @property + def supports_eplb(self) -> bool: + return True + + @property + def allow_inplace(self) -> bool: + return True + def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: if self.rocm_aiter_moe_enabled: return None @@ -650,7 +793,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ) if self.rocm_aiter_moe_enabled: - assert self.fused_experts is None result = self.rocm_aiter_fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -671,21 +813,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): activation=activation, apply_router_weight_on_input=apply_router_weight_on_input, ) - elif self.fused_experts is not None: - result = self.fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, - ) else: - assert fused_experts is not None result = fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -1267,7 +1395,7 @@ class FusedMoE(CustomOp): "Only softmax scoring function is supported for non-grouped topk." ) - moe = FusedMoEConfig( + self.moe_config: FusedMoEConfig = FusedMoEConfig( num_experts=self.global_num_experts, experts_per_token=top_k, hidden_dim=hidden_size, @@ -1279,24 +1407,26 @@ class FusedMoE(CustomOp): is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, ) - self.moe_config: FusedMoEConfig = moe + self.moe_quant_config: FusedMoEQuantConfig | None = None self.quant_config = quant_config + def _get_quant_method() -> FusedMoEMethodBase: + """ + Helper method to ensure self.quant_method is never None and + of the proper type. 
+ """ + quant_method = None + if self.quant_config is not None: + quant_method = self.quant_config.get_quant_method(self, prefix) + if quant_method is None: + quant_method = UnquantizedFusedMoEMethod(self.moe_config) + assert isinstance(quant_method, FusedMoEMethodBase) + return quant_method + # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. - quant_method: QuantizeMethodBase | None = None - quant_method = ( - UnquantizedFusedMoEMethod(moe) - if quant_config is None - else quant_config.get_quant_method(self, prefix) - ) - if quant_method is None: - quant_method = UnquantizedFusedMoEMethod(moe) - - assert quant_method is not None - assert isinstance(quant_method, FusedMoEMethodBase) - self.quant_method = quant_method + self.quant_method: FusedMoEMethodBase = _get_quant_method() if not self.moe_config.is_act_and_mul: # Avoid circular import @@ -1305,7 +1435,7 @@ class FusedMoE(CustomOp): ) if not isinstance( - quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod) + self.quant_method, (UnquantizedFusedMoEMethod, ModelOptFp8MoEMethod) ): raise NotImplementedError( "is_act_and_mul=False is supported only for unquantized " @@ -1316,20 +1446,18 @@ class FusedMoE(CustomOp): "is_act_and_mul=False is supported only for CUDA for now" ) - if self.enable_eplb: - from vllm.model_executor.layers.quantization.fp8 import Fp8MoEMethod - - if not isinstance(quant_method, (Fp8MoEMethod, UnquantizedFusedMoEMethod)): - # TODO: Add support for additional quantization methods. - # The implementation for other quantization methods does not - # contain essential differences, but the current quant API - # design causes duplicated work when extending to new - # quantization methods, so I'm leaving it for now. - # If you plan to add support for more quantization methods, - # please refer to the implementation in `Fp8MoEMethod`. - raise NotImplementedError( - "EPLB is only supported for FP8 quantization for now." - ) + if self.enable_eplb and not self.quant_method.supports_eplb: + # TODO: Add support for additional quantization methods. + # The implementation for other quantization methods does not + # contain essential differences, but the current quant API + # design causes duplicated work when extending to new + # quantization methods, so I'm leaving it for now. + # If you plan to add support for more quantization methods, + # please refer to the implementation in `Fp8MoEMethod`. + raise NotImplementedError( + f"EPLB is not supported {self.quant_method.__class__.__name__}. " + "EPLB is only supported for FP8 quantization for now." + ) moe_quant_params = { "num_experts": self.local_num_experts, @@ -1353,6 +1481,15 @@ class FusedMoE(CustomOp): self.batched_hidden_states: torch.Tensor | None = None self.batched_router_logits: torch.Tensor | None = None + # Note: maybe_init_modular_kernel should only be called by + # prepare_communication_buffer_for_model. + # This is called after all weight loading and post-processing, so it + # should be safe to swap out the quant_method. 
+ def maybe_init_modular_kernel(self) -> None: + mk = self.quant_method.maybe_init_modular_kernel(self) + if mk is not None: + self.quant_method = FusedMoEModularMethod(self.quant_method, mk) + @property def shared_experts(self) -> torch.nn.Module | None: return None @@ -2167,7 +2304,7 @@ class FusedMoE(CustomOp): """ assert self.quant_method is not None return ( - self.quant_method.fused_experts is not None + isinstance(self.quant_method, FusedMoEModularMethod) and self.quant_method.fused_experts.output_is_reduced() ) @@ -2403,7 +2540,7 @@ class FusedMoE(CustomOp): self.ensure_dp_chunking_init() has_separate_shared_experts = ( - not isinstance(self.quant_method.fused_experts, FusedMoEModularKernel) + not isinstance(self.quant_method, FusedMoEModularMethod) and self.shared_experts is not None ) @@ -2430,8 +2567,8 @@ class FusedMoE(CustomOp): hidden_states, router_logits, has_separate_shared_experts ) - do_naive_dispatch_combine: bool = ( - self.dp_size > 1 and not self.quant_method.using_modular_kernel + do_naive_dispatch_combine: bool = self.dp_size > 1 and not isinstance( + self.quant_method, FusedMoEModularMethod ) # If there are shared experts but we are not using a modular kernel, the diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 3b5916f8ccaf8..b5fa2c71bec58 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -707,6 +707,12 @@ class FusedMoEModularKernel(torch.nn.Module): f"{fused_experts.activation_formats[0]}" ) + def supports_expert_map(self) -> bool: + """ + A flag indicating whether or not this class supports expert maps. + """ + return self.fused_experts.supports_expert_map() + def output_is_reduced(self) -> bool: """ Indicates whether or not the output of fused MoE kernel diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index daf7422963f3c..3e1f87b59a34d 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -617,8 +617,6 @@ class AWQMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.") diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index ccd9b311cc932..e5a741e639ad9 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -518,12 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `BitsAndBytesMoEMethod` yet." 
) + topk_weights, topk_ids, _ = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index bf38c15b47013..d95d49eddfe3a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -462,12 +462,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): indices_type=self.topk_indices_dtype, ) - # - # Note: the order here is important. self.fused_experts can override - # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin. - # if self.use_marlin: - assert self.fused_experts is None return fused_marlin_moe( x, layer.w13_weight, @@ -488,24 +483,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): workspace=layer.workspace, ) - elif self.fused_experts is not None: - assert is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, layer.w2_weight - ), "Flashinfer CUTLASS Fused MoE not applicable!" - - return self.fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - # FlashInfer fused experts path elif self.allow_flashinfer: from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 @@ -1066,13 +1043,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL - # - # Note: the order here is important. self.fused_experts can override - # cutlass fp8 or fused_experts but not marlin or rocm. - # if self.use_marlin: assert activation == "silu", f"{activation} not supported for Marlin MoE." - assert self.fused_experts is None return fused_marlin_moe( x, layer.w13_weight, @@ -1098,7 +1070,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): assert per_act_token == per_channel_quant assert self.moe_quant_config is not None - assert self.fused_experts is None return rocm_aiter_fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -1111,18 +1082,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): quant_config=self.moe_quant_config, ) - elif self.fused_experts is not None: - return self.fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, - ) - # cutlass path elif self.use_cutlass: assert self.moe_quant_config is not None @@ -1318,8 +1277,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet." 
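# Note on the hunks above: the "order here is important" comments and the
# `self.fused_experts is not None` branches are no longer reachable in these
# per-backend apply() implementations. Once a modular kernel exists,
# maybe_init_modular_kernel() replaces the layer's quant_method with a
# FusedMoEModularMethod, so only the non-modular paths (marlin, rocm aiter,
# cutlass, flashinfer) remain here.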
@@ -1636,8 +1593,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet." @@ -1901,8 +1856,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet." diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 754608af97c6b..5241f9a2301be 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -158,8 +158,6 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `ExpertsInt8MoEMethod` yet." diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f82eccb88ce09..03eca199d536d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -703,9 +703,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.quant_config = quant_config self.weight_block_size = self.quant_config.weight_block_size self.block_quant: bool = self.weight_block_size is not None - - self.fused_experts: mk.FusedMoEModularKernel | None = None # type: ignore - self.fp8_backend = get_fp8_moe_backend(self.block_quant) self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN @@ -1181,6 +1178,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): block_shape=self.weight_block_size, ) + @property + def supports_eplb(self) -> bool: + return True + + @property + def allow_inplace(self) -> bool: + return True + def apply( self, layer: torch.nn.Module, @@ -1210,10 +1215,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): assert logical_replica_count is not None assert isinstance(layer, FusedMoE) - if ( - self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM - and self.fused_experts is None - ): + if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: assert activation == "silu", ( f"Expected 'silu' activation but got {activation}" ) @@ -1290,10 +1292,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): num_fused_shared_experts=layer.num_fused_shared_experts, ) - # - # Note: the order of checks is important since self.fused_experts - # can override fused_experts or cutlass but not rocm or marlin. - # topk_weights, topk_ids, zero_expert_result = select_result if self.rocm_aiter_moe_enabled: @@ -1301,7 +1299,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): rocm_aiter_fused_experts, ) - assert self.fused_experts is None result = rocm_aiter_fused_experts( x, layer.w13_weight, @@ -1315,7 +1312,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) elif self.use_marlin: assert activation == "silu", f"{activation} not supported for Marlin MoE." 
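            # Fp8MoEMethod now opts in to EPLB and in-place execution through
            # the supports_eplb / allow_inplace properties above (matching
            # UnquantizedFusedMoEMethod), rather than being special-cased by
            # isinstance checks in FusedMoE.__init__.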
- assert self.fused_experts is None result = fused_marlin_moe( x, layer.w13_weight, @@ -1333,19 +1329,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): expert_map=expert_map, workspace=layer.workspace, ) - elif self.fused_experts: - result = self.fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, - ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: assert not self.block_quant assert not renormalize and custom_routing_function is not None diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 8a914c57a9f7d..caabcd0ca0ee5 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -585,8 +585,6 @@ class GGUFMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.") diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 0d5439357fda2..42a569e7770c0 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -742,8 +742,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `GPTQMarlinMoEMethod` yet." diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 37b682984fc35..f61d2a52925d9 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -18,9 +18,6 @@ from vllm.model_executor.layers.fused_moe.config import ( fp8_w8a8_moe_quant_config, nvfp4_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( - is_valid_flashinfer_cutlass_fused_moe, -) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, @@ -605,7 +602,6 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: - assert self.fused_experts is None assert activation == "silu", ( f"Expected 'silu' activation but got {activation}" ) @@ -638,24 +634,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - # - # Note: the order here is important. self.fused_experts can override - # cutlass or fused_experts. 
- # - if self.fused_experts is not None: - return self.fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) - elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: assert not renormalize assert activation == "silu", ( f"Expected 'silu' activation but got {activation}" @@ -1647,8 +1626,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): from vllm.model_executor.models.llama4 import Llama4MoE - assert self.fused_experts is None - a1_gscale = layer.w13_input_scale_quant (hidden_states_fp4, hidden_states_scale_linear_fp4) = ( flashinfer.fp4_quantize( @@ -1720,13 +1697,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): indices_type=self.topk_indices_dtype, ) - # - # Note: the order here is important. self.fused_experts can override - # flashinfer cutlass, cutlass fp4 or fused_experts but not marlin or - # trtllm. - # if self.use_marlin: - assert self.fused_experts is None return fused_marlin_moe( x, layer.w13_weight, @@ -1747,23 +1718,24 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): workspace=layer.workspace, ) - elif self.fused_experts is not None: - assert ( - self.allow_flashinfer - and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + elif ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + ): + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + flashinfer_cutlass_moe_fp4, ) - assert is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, layer.w2_weight - ), "Flashinfer CUTLASS Fused MoE not applicable!" 
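            # With the modular hook removed, the non-modular FlashInfer
            # CUTLASS path below calls flashinfer_cutlass_moe_fp4 directly,
            # passing the layer's moe_quant_config instead of relying on state
            # captured by a swapped-in kernel.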
+ assert self.moe_quant_config is not None - return self.fused_experts( + return flashinfer_cutlass_moe_fp4( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec + quant_config=self.moe_quant_config, + inplace=False, activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index b0a268b9950b7..2090c86f78dc8 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -226,7 +226,6 @@ class MoeWNA16Method(FusedMoEMethodBase): params_dtype: torch.dtype, **extra_weight_attrs, ): - self.moe = layer layer.quant_config = self.quant_config bit8_pack_factor = self.quant_config.bit8_pack_factor group_size = self.quant_config.group_size @@ -381,7 +380,6 @@ class MoeWNA16Method(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None if enable_eplb: raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.") diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index bf34ec0f38996..7b1600a03d55b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -197,8 +197,6 @@ class Mxfp4Config(QuantizationConfig): class Mxfp4MoEMethod(FusedMoEMethodBase): def __init__(self, moe: FusedMoEConfig): super().__init__(moe) - self.topk_indices_dtype = None - self.moe = moe self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled) self.max_capture_size = ( get_current_vllm_config().compilation_config.max_cudagraph_capture_size @@ -815,6 +813,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): "EP batched experts format" ) else: + layer.w13_weight = ( + self.w13_weight_triton_tensor + if layer.w13_weight is None + else layer.w13_weight + ) + layer.w2_weight = ( + self.w2_weight_triton_tensor + if layer.w2_weight is None + else layer.w2_weight + ) + assert all([w is not None for w in [layer.w13_weight, layer.w2_weight]]) + assert self.moe_quant_config is not None if ( self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM @@ -838,71 +848,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for EP" ) - def _route_and_experts( - self, - layer: torch.nn.Module, - x: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, - ) -> torch.Tensor: - assert isinstance(self.fused_experts, mk.FusedMoEModularKernel) - - topk_weights, topk_ids, _ = FusedMoE.select_experts( - hidden_states=x, - router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - 
renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - enable_eplb=enable_eplb, - expert_map=expert_map, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) - - w13_weight = ( - self.w13_weight_triton_tensor - if layer.w13_weight is None - else layer.w13_weight - ) - w2_weight = ( - self.w2_weight_triton_tensor if layer.w2_weight is None else layer.w2_weight - ) - assert all([w is not None for w in [w13_weight, w2_weight]]) - - return self.fused_experts( - hidden_states=x, - w1=w13_weight, - w2=w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, - ) + @property + def allow_inplace(self) -> bool: + return True def apply( self, @@ -930,29 +878,6 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): if enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") - if self.fused_experts is not None: - return self._route_and_experts( - layer, - x, - router_logits, - top_k, - renormalize, - use_grouped_topk, - topk_group, - num_expert_group, - global_num_experts, - expert_map, - custom_routing_function, - scoring_func, - e_score_correction_bias, - apply_router_weight_on_input, - activation, - enable_eplb, - expert_load_view, - logical_to_physical_map, - logical_replica_count, - ) - if self.mxfp4_backend == Mxfp4Backend.MARLIN: topk_weights, topk_ids, _ = FusedMoE.select_experts( hidden_states=x, diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index a8f4b1b0db68d..8825611051e5d 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -310,7 +310,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): # Property to determine if AITER is used if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa E501 - rocm_aiter_fused_experts, shuffle_weights, ) @@ -322,17 +321,11 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) - self.rocm_aiter_fused_experts_func = rocm_aiter_fused_experts elif self.use_marlin: prepare_moe_fp8_layer_for_marlin(layer, False) # Activations not quantized for marlin. del layer.w13_input_scale del layer.w2_input_scale - self.fused_experts_func = None - else: - from vllm.model_executor.layers.fused_moe import fused_experts - - self.fused_experts_func = fused_experts def get_fused_moe_quant_config( self, layer: torch.nn.Module @@ -369,8 +362,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet." 
@@ -392,7 +383,11 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): ) if self.rocm_aiter_moe_enabled: - return self.rocm_aiter_fused_experts_func( + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts, + ) + + return rocm_aiter_fused_experts( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -403,7 +398,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): quant_config=self.moe_quant_config, expert_map=expert_map, ) - if self.use_marlin: + elif self.use_marlin: assert activation == "silu", f"{activation} not supported for Marlin MoE." return fused_marlin_moe( x, @@ -421,22 +416,22 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): global_num_experts=global_num_experts, expert_map=expert_map, ) + else: + from vllm.model_executor.layers.fused_moe import fused_experts - assert self.fused_experts_func is not None - - return self.fused_experts_func( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, - quant_config=self.moe_quant_config, - ) + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.moe_quant_config, + ) class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): @@ -601,6 +596,10 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): block_shape=None, ) + @property + def allow_inplace(self) -> bool: + return True + def apply( self, layer: torch.nn.Module, @@ -624,8 +623,6 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError( "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet." 
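# Design note for the quark hunks above: rather than caching
# rocm_aiter_fused_experts / fused_experts on the method during
# process_weights_after_loading, apply() now resolves the kernel at call time
# via local imports and a plain if/elif/else, so no dispatch state has to
# survive weight loading.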
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index e4f7ff8339569..52656263a601b 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -377,8 +377,6 @@ class RTNMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.fused_experts is None - if enable_eplb: raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.") diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index 78cbcd8e5427f..bdcebd498ef01 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -13,7 +13,7 @@ import vllm.envs as envs from vllm.distributed.parallel_state import get_dp_group from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.deep_gemm_utils import compute_aligned_M -from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, FusedMoEModularMethod from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, @@ -160,8 +160,8 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool: ): return False - if not isinstance(module.quant_method.fused_experts, FusedMoEModularKernel): - # fused_experts could invoke deep_gemm_moe_fp8 + if not isinstance(module.quant_method, FusedMoEModularMethod): + # modular kernels could invoke deep_gemm_moe_fp8 return True mk: FusedMoEModularKernel = module.quant_method.fused_experts From 5a0a6dfd55e1b9b2b518e0d2e91bd2c1241a7694 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 4 Nov 2025 07:38:16 -0800 Subject: [PATCH 123/976] [BugFix] Fix incorrect preallocated sampled_token_ids tensor size (#28025) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e700c09038e28..177542ed96c8e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -524,7 +524,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self._draft_token_ids: list[list[int]] | torch.Tensor | None = None self.transfer_event = torch.cuda.Event() self.sampled_token_ids_pinned_cpu = torch.empty( - (self.max_model_len, 1), + (self.max_num_reqs, 1), dtype=torch.int64, device="cpu", pin_memory=self.pin_memory, From 97e3dda84ba79100509fafb58d651bde25e3f32f Mon Sep 17 00:00:00 2001 From: lyrisz <145491716+LyrisZhong@users.noreply.github.com> Date: Tue, 4 Nov 2025 07:49:25 -0800 Subject: [PATCH 124/976] [Perf] SM100 - add swap AB optimization to CUTLASS FP8 GEMM (#27284) Signed-off-by: Faqin Zhong Co-authored-by: Faqin Zhong Co-authored-by: Michael Goin --- .../w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu | 9 +- .../c3x/scaled_mm_sm100_fp8_dispatch.cuh | 276 +++++++++++++++--- 2 files changed, 233 insertions(+), 52 deletions(-) diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu index cf2cccc913f62..62aeb927ccdcb 100644 --- 
a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu @@ -1,6 +1,5 @@ #include "scaled_mm_kernels.hpp" #include "scaled_mm_sm100_fp8_dispatch.cuh" -#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" namespace vllm { @@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm100_fp8_epilogue( - out, a, b, a_scales, b_scales, *bias); + return cutlass_scaled_mm_sm100_fp8_epilogue(out, a, b, a_scales, + b_scales, *bias); } else { - return cutlass_scaled_mm_sm100_fp8_epilogue( - out, a, b, a_scales, b_scales); + return cutlass_scaled_mm_sm100_fp8_epilogue(out, a, b, a_scales, + b_scales); } } diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh index f876b7d9acd87..c950008b4139a 100644 --- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -2,6 +2,7 @@ #include "scaled_mm.cuh" #include "cutlass_gemm_caller.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" /** * This file defines Gemm kernel configurations for SM100 (fp8) based on the @@ -12,8 +13,88 @@ namespace vllm { using c3x::cutlass_gemm_caller; -template typename Epilogue> +template typename Epilogue_, + typename TileShape, typename ClusterShape, typename KernelSchedule, + typename EpilogueSchedule, bool swap_ab_ = false> +struct cutlass_3x_gemm_sm100_fp8 { + using ElementAB = ElementAB_; + using ElementC = ElementD_; + using ElementD = ElementD_; + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using Epilogue = Epilogue_; + + using EVTCompute = typename Epilogue::EVTCompute; + + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentCD = + 128 / cutlass::sizeof_bits::value; + + // Compile-time swap_ab flag + static constexpr bool swap_ab = swap_ab_; + + // ----------------------------------------------------------- + // Layout definitions + // ----------------------------------------------------------- + using LayoutA = cutlass::layout::RowMajor; + using LayoutA_T = typename cutlass::layout::LayoutTranspose::type; + + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_T = typename cutlass::layout::LayoutTranspose::type; + + using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = + typename cutlass::layout::LayoutTranspose::type; + + using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; + + // ----------------------------------------------------------- + // Collective epilogue (conditionally swap operands and layouts) + // ----------------------------------------------------------- + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, TileShape, + ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, + ElementAcc, float, ElementC, + conditional_t, AlignmentCD, + ElementD, conditional_t, + AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp; + + static constexpr size_t CEStorageSize = + sizeof(typename CollectiveEpilogue::SharedStorage); + + using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout< + 
static_cast(CEStorageSize)>; + + // ----------------------------------------------------------- + // Collective mainloop (conditionally swap operands and layouts) + // ----------------------------------------------------------- + using CollectiveMainloop = conditional_t< + swap_ab, + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutB_T, AlignmentAB, // Swapped B (as A) + ElementAB, LayoutA_T, AlignmentAB, // Swapped A (as B) + ElementAcc, TileShape, ClusterShape, Stages, + KernelSchedule>::CollectiveOp, + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, ElementAB, + LayoutA, AlignmentAB, ElementAB, LayoutB, AlignmentAB, ElementAcc, + TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp>; + + // ----------------------------------------------------------- + // Kernel definition + // ----------------------------------------------------------- + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, CollectiveMainloop, CollectiveEpilogue, void>; +}; + +template struct sm100_fp8_config_default { // M in (256, inf) static_assert(std::is_same()); @@ -22,12 +103,16 @@ struct sm100_fp8_config_default { using TileShape = Shape<_256, _128, _128>; using ClusterShape = Shape<_2, _2, _1>; using Cutlass3xGemm = - cutlass_3x_gemm_sm100; + conditional_t, + cutlass_3x_gemm_sm100_fp8< + InType, OutType, c3x::ScaledEpilogue, TileShape, + ClusterShape, KernelSchedule, EpilogueSchedule>>; }; -template typename Epilogue> +template struct sm100_fp8_config_M256 { // M in (64, 256] static_assert(std::is_same()); @@ -36,44 +121,127 @@ struct sm100_fp8_config_M256 { using TileShape = Shape<_128, _128, _128>; using ClusterShape = Shape<_2, _1, _1>; using Cutlass3xGemm = - cutlass_3x_gemm_sm100; + conditional_t, + cutlass_3x_gemm_sm100_fp8< + InType, OutType, c3x::ScaledEpilogue, TileShape, + ClusterShape, KernelSchedule, EpilogueSchedule>>; }; -template typename Epilogue> +template +struct sm100_fp8_config_M64_swap_ab { + // This config is for M in (16, 64] and K >= 4096 + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _64, _256>; + using ClusterShape = Shape<_4, _1, _1>; + + // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap + // AB + using Cutlass3xGemm = conditional_t< + EnableBias, + cutlass_3x_gemm_sm100_fp8, + cutlass_3x_gemm_sm100_fp8>; +}; + +template struct sm100_fp8_config_M64 { - // M in (16, 64] + // This config is for M = 64 and K < 4096 (do not enable swap AB in such case) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; using TileShape = Shape<_64, _64, _128>; using ClusterShape = Shape<_1, _1, _1>; + using Cutlass3xGemm = - cutlass_3x_gemm_sm100; + conditional_t, + cutlass_3x_gemm_sm100_fp8< + InType, OutType, c3x::ScaledEpilogue, TileShape, + ClusterShape, KernelSchedule, EpilogueSchedule>>; }; -template typename Epilogue> -struct sm100_fp8_config_M16 { +template +struct sm100_fp8_config_M16_swap_ab { // M in [1, 16] static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; - using TileShape = 
Shape<_64, _64, _128>; - using ClusterShape = Shape<_1, _4, _1>; - using Cutlass3xGemm = - cutlass_3x_gemm_sm100; + using TileShape = Shape<_128, _32, _128>; + using ClusterShape = Shape<_4, _1, _1>; + + // Use ScaledEpilogueColumnBias instead of ScaledEpilogueBias when doing swap + // AB + using Cutlass3xGemm = conditional_t< + EnableBias, + cutlass_3x_gemm_sm100_fp8, + cutlass_3x_gemm_sm100_fp8>; }; -template typename Epilogue, +template +void cutlass_gemm_caller_sm100_fp8(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_params) { + static constexpr bool swap_ab = Gemm::swap_ab; + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + using GemmKernel = typename Gemm::GemmKernel; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + + int32_t m = a.size(0), n = b.size(1), k = a.size(1); + auto prob_shape = + swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); + + StrideA a_stride = + cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + StrideB b_stride = + cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + StrideC c_stride = cutlass::make_cute_packed_stride( + StrideC{}, + swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1)); + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto c_ptr = static_cast(out.data_ptr()); + + typename GemmKernel::MainloopArguments mainloop_args = + swap_ab ? typename GemmKernel::MainloopArguments{b_ptr, b_stride, a_ptr, + a_stride} + : typename GemmKernel::MainloopArguments{a_ptr, a_stride, b_ptr, + b_stride}; + + typename GemmKernel::EpilogueArguments epilogue_args{ + Gemm::Epilogue::prepare_args( + std::forward(epilogue_params)...), + c_ptr, c_stride, c_ptr, c_stride}; + + c3x::cutlass_gemm_caller(a.device(), prob_shape, mainloop_args, + epilogue_args); +} + +template inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, + torch::Tensor const& a_scales, + torch::Tensor const& b_scales, EpilogueArgs&&... 
args) { static_assert(std::is_same()); TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); @@ -81,55 +249,69 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm100_fp8_config_default::Cutlass3xGemm; - using Cutlass3xGemmM16 = - typename sm100_fp8_config_M16::Cutlass3xGemm; + EnableBias>::Cutlass3xGemm; + using Cutlass3xGemmM16SwapAB = + typename sm100_fp8_config_M16_swap_ab::Cutlass3xGemm; + using Cutlass3xGemmM64SwapAB = + typename sm100_fp8_config_M64_swap_ab::Cutlass3xGemm; using Cutlass3xGemmM64 = - typename sm100_fp8_config_M64::Cutlass3xGemm; + typename sm100_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM256 = - typename sm100_fp8_config_M256::Cutlass3xGemm; + typename sm100_fp8_config_M256::Cutlass3xGemm; uint32_t const m = a.size(0); - uint32_t const mp2 = - std::max(static_cast(16), next_pow_2(m)); // next power of 2 + uint32_t const k = a.size(1); - if (mp2 <= 16) { + if (m <= 16) { // m in [1, 16] - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 64) { + return cutlass_gemm_caller_sm100_fp8( + out, a, b, b_scales, a_scales, std::forward(args)...); + } else if (m <= 64) { // m in (16, 64] - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 256) { + if (m == 64 && k < 4096) { + // do not enable swap AB + return cutlass_gemm_caller_sm100_fp8( + out, a, b, a_scales, b_scales, std::forward(args)...); + } + return cutlass_gemm_caller_sm100_fp8( + out, a, b, b_scales, a_scales, std::forward(args)...); + + } else if (m <= 256) { // m in (64, 256] - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); + return cutlass_gemm_caller_sm100_fp8( + out, a, b, a_scales, b_scales, std::forward(args)...); } else { // m in (256, inf) - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); + return cutlass_gemm_caller_sm100_fp8( + out, a, b, a_scales, b_scales, std::forward(args)...); } } -template