From 745a3bae1aef2ff3aa70b3eab8624e4571698ba0 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Thu, 27 Nov 2025 18:48:28 -0800 Subject: [PATCH 01/98] [LoRA] Support FusedMoE LoRA Triton kernel for mxfp4 (#28971) Signed-off-by: Xin Yang Co-authored-by: Jee Jee Li --- .../moe/test_modular_oai_triton_moe.py | 250 ++++++++++++++++++ vllm/lora/layers/fused_moe.py | 37 ++- .../fused_moe/gpt_oss_triton_kernels_moe.py | 146 ++++++++++ .../layers/quantization/mxfp4.py | 20 +- 4 files changed, 441 insertions(+), 12 deletions(-) create mode 100644 tests/kernels/moe/test_modular_oai_triton_moe.py diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py new file mode 100644 index 0000000000000..3361d85e92507 --- /dev/null +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -0,0 +1,250 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test modular OAI Triton MoE +""" + +import pytest +import torch + +from vllm.utils.import_utils import has_triton_kernels + +if not has_triton_kernels(): + pytest.skip( + "triton_kernels not found, skipping all related tests", + allow_module_level=True, + ) + +from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig +from triton_kernels.numerics import InFlexData +from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, upcast_from_mxfp +from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor +from triton_kernels.tensor_details import layout +from triton_kernels.testing import assert_close + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + OAITritonExperts, + UnfusedOAITritonExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel 
+from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) +from vllm.model_executor.layers.utils import shuffle_weight +from vllm.platforms import current_platform + +MNK = [ + (1, 512, 384), + (1, 2880, 2880), + (2, 512, 384), + (2, 2880, 2880), + (32, 2880, 2880), + (64, 2880, 2880), +] + + +def unshuffle_weight(w: torch.Tensor): + first = w[..., ::2] + second = w[..., 1::2] + return torch.concat((first, second), dim=-1) + + +def make_weights(dtype, k, n, e): + w1 = torch.randn((e, k, 2 * n), dtype=dtype, device="cuda") + w1_bias = torch.randn((e, 2 * n), dtype=dtype, device="cuda") + + w2 = torch.randn((e, n, k), dtype=dtype, device="cuda") + w2_bias = torch.randn((e, k), dtype=dtype, device="cuda") + + w1_tri = w1.clone() + w2_tri = w2.clone() + + w1_bias_tri = w1_bias.clone() + w2_bias_tri = w2_bias.clone() + w1_bias_tri = w1_bias_tri.to(torch.float32) + w2_bias_tri = w2_bias_tri.to(torch.float32) + + # shuffle weights + w1_tri = shuffle_weight(w1_tri) + w1_bias_tri = shuffle_weight(w1_bias_tri) + + # quant triton_weights + w1_tri, w1_scale_tri = downcast_to_mxfp(w1_tri, torch.uint8, axis=1) + w1 = upcast_from_mxfp(w1_tri, w1_scale_tri, dtype, axis=1) + w1 = unshuffle_weight(w1) + + w2_tri, w2_scale_tri = downcast_to_mxfp(w2_tri, torch.uint8, axis=1) + w2 = upcast_from_mxfp(w2_tri, w2_scale_tri, dtype, axis=1) + + num_warps = 8 + w_layout, w_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1) + w_scale_layout, w_scale_layout_opts = ( + layout.make_default_matmul_mxfp4_w_scale_layout(mx_axis=1, num_warps=num_warps) + ) + + w1_tri = convert_layout(wrap_torch_tensor(w1_tri, FP4), w_layout, **w_layout_opts) + w1_scale_tri = convert_layout( + wrap_torch_tensor(w1_scale_tri), + w_scale_layout, + **w_scale_layout_opts, + ) + + w2_tri = convert_layout(wrap_torch_tensor(w2_tri, FP4), w_layout, **w_layout_opts) + w2_scale_tri = convert_layout( + wrap_torch_tensor(w2_scale_tri), + w_scale_layout, + 
**w_scale_layout_opts, + ) + + w1_precision_config = PrecisionConfig( + weight_scale=w1_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData()) + ) + w2_precision_config = PrecisionConfig( + weight_scale=w2_scale_tri, flex_ctx=FlexCtx(rhs_data=InFlexData()) + ) + + return ( + w1, + w2, + w1_bias, + w2_bias, + w1_tri, + w2_tri, + w1_bias_tri, + w2_bias_tri, + w1_precision_config, + w2_precision_config, + ) + + +def swiglu(x, alpha: float = 1.702, limit: float = 1.0): + # Note we add an extra bias of 1 to the linear layer + x_glu, x_linear = torch.chunk(x, 2, dim=-1) + if limit is not None: + x_glu = x_glu.clamp(max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + if limit is not None: + x_linear = x_linear.clamp(min=-limit, max=limit) + return out_glu * (x_linear + 1) + + +def torch_moe_impl( + hidden_states: torch.Tensor, # (M, K) + w1: torch.Tensor, # (E, K, 2N) + w2: torch.Tensor, # (E, N, K) + w1_bias: torch.Tensor, # (E, 2N) + w2_bias: torch.Tensor, # (E, K) + topk_weights: torch.Tensor, # (M, topk) + topk_ids: torch.Tensor, # (M, topk) +): + w1 = w1[topk_ids, ...] + w1_bias = w1_bias[topk_ids, ...] + hidden_states = torch.einsum("bekc,bk->bec", w1, hidden_states) + w1_bias + hidden_states = swiglu(hidden_states, limit=7) + + w2 = w2[topk_ids, ...] + w2_bias = w2_bias[topk_ids, ...] 
+ hidden_states = torch.einsum("bekc,bek->bec", w2, hidden_states) + w2_bias + + # Weighted sum of experts + hidden_states = torch.einsum("bec,be->bc", hidden_states, topk_weights) + return hidden_states + + +def oai_triton_moe_impl( + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: "PrecisionConfig", + w2_scale: "PrecisionConfig", + w1_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, + num_experts: int, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + unfused: bool = False, +) -> torch.Tensor: + quant_config = mxfp4_w4a16_moe_quant_config( + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + + if unfused: + fused_experts = UnfusedOAITritonExperts(quant_config) + else: + fused_experts = OAITritonExperts(quant_config) + + mk = FusedMoEModularKernel(MoEPrepareAndFinalizeNoEP(), fused_experts) + + return mk.forward( + hidden_states=x, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation="swigluoai", + global_num_experts=num_experts, + expert_map=None, + apply_router_weight_on_input=False, + ) + + +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform." 
+) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("m,n,k", MNK) +@pytest.mark.parametrize("num_experts", [32, 128]) +@pytest.mark.parametrize("topk", [4]) +@pytest.mark.parametrize("unfused", [True, False]) +def test_oai_triton_moe( + dtype: torch.dtype, + m: int, + n: int, + k: int, + num_experts: int, + topk: int, + unfused: bool, +): + current_platform.seed_everything(0) + ( + w1, + w2, + w1_bias, + w2_bias, + w1_tri, + w2_tri, + w1_bias_tri, + w2_bias_tri, + w1_precision_config, + w2_precision_config, + ) = make_weights(dtype, k, n, num_experts) + + x = torch.randn((m, k), dtype=dtype, device="cuda") + router_logits = torch.randn(m, num_experts, device="cuda", dtype=dtype) + topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1, sorted=True) + topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) + + with set_current_vllm_config(VllmConfig()): + out_ref = torch_moe_impl(x, w1, w2, w1_bias, w2_bias, topk_weights, topk_ids) + + out = oai_triton_moe_impl( + x, + w1_tri, + w2_tri, + w1_precision_config, + w2_precision_config, + w1_bias_tri, + w2_bias_tri, + num_experts, + topk_weights, + topk_ids, + unfused, + ) + + assert_close(ref=out_ref, tri=out, maxtol=0.025, rmstol=0.005) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 3ad19370962ab..24cab79a72443 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -20,15 +20,24 @@ from vllm.model_executor.layers.fused_moe.config import ( _get_config_dtype_str, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - modular_marlin_fused_moe, + MarlinExperts, ) from vllm.model_executor.layers.fused_moe.fused_moe import ( - modular_triton_fused_moe, + TritonExperts, try_get_optimal_moe_config, ) from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import ( FusedMoEModularMethod, ) +from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + 
UnfusedOAITritonExperts, +) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel, +) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from .utils import _get_lora_device @@ -114,15 +123,23 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.base_layer.ensure_moe_quant_config_init() quant_config = self.base_layer.quant_method.moe_quant_config - m_fused_moe_fn = ( - modular_triton_fused_moe( - quant_config, shared_experts=self.base_layer.shared_experts - ) - if not quant_config.use_mxfp4_w4a16 - else modular_marlin_fused_moe( - quant_config, shared_experts=self.base_layer.shared_experts - ) + prepare_finalize = MoEPrepareAndFinalizeNoEP() + m_fused_moe_fn = FusedMoEModularKernel( + prepare_finalize, + self.base_layer.quant_method.select_gemm_impl( + prepare_finalize, self.base_layer + ), + self.base_layer.shared_experts, + getattr(self.base_layer, "shared_experts_stream", None), ) + if quant_config.use_mxfp4_w4a16: + assert isinstance( + m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts) + ) + else: + assert isinstance( + m_fused_moe_fn.fused_experts, (MarlinExperts, TritonExperts) + ) def fwd_decorator(layer, func): def wrapper(*args, **kwargs): diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 128507639fdfd..0b006e15632e1 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, @@ -376,3 +377,148 @@ class OAITritonExperts(BaseOAITritonExperts): intermediate_cache=workspace2, a1q_scale=a1q_scale, ) + + +class 
UnfusedOAITritonExperts(BaseOAITritonExperts): + """ + A Triton based MoE expert class that operates on expert standard + format and explicitly keeps the activation and reduction (moe_sum) steps + unfused from the matmul_ogs kernel. This exposes injection points + for activation and moe_sum. + + One use case for it is to inject LoRA modules on the activation and moe_sum. + """ + + def __init__(self, quant_config: FusedMoEQuantConfig): + # TODO (varun) : Enable activation quantization + assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16" + super().__init__(quant_config) + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # workspace are allocated inside the kernel + workspace1 = (M * topk, N // 2) + workspace2 = (M * topk, max(N, K)) + output = (M, K) + return (workspace1, workspace2, output) + + def moe_sum(self, input: torch.Tensor, output: torch.Tensor): + ops.moe_sum(input, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + if self.quant_config is None: + self.quant_config = FUSED_MOE_UNQUANTIZED_CONFIG + + if expert_map is not None: + topk_ids = 
expert_map[topk_ids] + + local_num_experts = w1.size(0) + if global_num_experts == -1: + global_num_experts = local_num_experts + + routing_data, gather_indx, scatter_indx = self._make_routing_data( + topk_ids, topk_weights, local_num_experts + ) + + topk = topk_ids.size(1) + + # type check, uint8 means mxfp4 + assert hidden_states.dtype == torch.bfloat16 + assert ( + self.quant_config.w1_bias is None + or self.quant_config.w1_bias.dtype == torch.float32 + ) + assert ( + self.quant_config.w2_bias is None + or self.quant_config.w2_bias.dtype == torch.float32 + ) + + # Shape check, only check non-mxfp4 + assert hidden_states.ndim == 2 + assert hidden_states.shape[-1] == w1.shape[-2] + assert w2.shape[-1] == w1.shape[1] + + batch_dim = 1 + M, K = hidden_states.shape + E, _, N = w1.shape + + if global_num_experts == -1: + global_num_experts = E + + # Note that the output tensor might be in workspace13 + intermediate_cache1 = _resize_cache(workspace2, (batch_dim, M * topk, N)) + intermediate_cache3 = _resize_cache(workspace2, (batch_dim, M * topk, K)) + intermediate_cache2 = _resize_cache(workspace13, (M * topk, N // 2)) + + gammas = routing_data.gate_scal if routing_data else None + + matmul_ogs( + hidden_states, + w1, + self.quant_config.w1_bias, + routing_data, + gather_indx=gather_indx, + precision_config=self.quant_config.w1_precision, + gammas=gammas if apply_router_weight_on_input else None, + fused_activation=None, + y=intermediate_cache1, + ) + + self.activation( + activation, intermediate_cache2, intermediate_cache1.view(-1, N) + ) + + # matmul_ogs grouped reduction fuse sum across multiple experts: + # y[dst_ind // n_expts_act, :] += x[src_ind, :] + # Need to set n_expts_act to 1 to unfuse moe_sum + routing_data.n_expts_act = 1 + + matmul_ogs( + intermediate_cache2, + w2, + self.quant_config.w2_bias, + routing_data, + scatter_indx=scatter_indx, + precision_config=self.quant_config.w2_precision, + gammas=None if apply_router_weight_on_input else gammas, + 
y=intermediate_cache3, + ) + + self.moe_sum(intermediate_cache3.view(-1, topk, K), output) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index bc241ac692e23..74036753496d4 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -30,6 +30,7 @@ from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( ) from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( OAITritonExperts, + UnfusedOAITritonExperts, ) from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod @@ -83,8 +84,21 @@ def get_mxfp4_backend_with_lora() -> Mxfp4Backend: if not current_platform.is_cuda(): return Mxfp4Backend.NONE - logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") - return Mxfp4Backend.MARLIN + # If FlashInfer is not available, try either Marlin or Triton + triton_kernels_supported = ( + has_triton_kernels() + and is_torch_equal_or_newer("2.8.0") + # NOTE: triton_kernels are only confirmed to work on SM90 and SM100 + # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317 + # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 + and (9, 0) <= current_platform.get_device_capability() < (11, 0) + ) + if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported: + logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") + return Mxfp4Backend.MARLIN + + logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend") + return Mxfp4Backend.TRITON def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: @@ -854,6 +868,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): elif self.mxfp4_backend == Mxfp4Backend.MARLIN: return MarlinExperts(self.moe_quant_config) elif self.mxfp4_backend == Mxfp4Backend.TRITON: + if self.moe.is_lora_enabled: + return 
UnfusedOAITritonExperts(self.moe_quant_config) return OAITritonExperts(self.moe_quant_config) else: raise NotImplementedError( From 18523b87f67b12e9044d690dfe9da7cddc390627 Mon Sep 17 00:00:00 2001 From: Wilson Wu Date: Fri, 28 Nov 2025 10:53:55 +0800 Subject: [PATCH 02/98] [Docs] Update supported models for Olmo 3 in tool calling documentation (#29411) Signed-off-by: Wilson Wu --- docs/features/tool_calling.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index dd79ba19b7247..22dda37279ac6 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -371,7 +371,8 @@ Olmo 3 models output tool calls in a format that is very similar to the one expe Supported models: -* TODO (will be updated after Olmo 3 release) +* `allenai/Olmo-3-7B-Instruct` +* `allenai/Olmo-3-32B-Think` Flags: `--tool-call-parser olmo3` From c7ba1f6bc762af8f231e6ee885725e7401d74578 Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Fri, 28 Nov 2025 13:42:30 +0800 Subject: [PATCH 03/98] [BugFix] Fix ValueError in NewRequestData repr methods (#29392) Signed-off-by: maang --- tests/v1/core/test_output.py | 36 ++++++++++++++++++++++++++++++++++++ vllm/v1/core/sched/output.py | 8 ++++++-- 2 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 tests/v1/core/test_output.py diff --git a/tests/v1/core/test_output.py b/tests/v1/core/test_output.py new file mode 100644 index 0000000000000..9dea19320e613 --- /dev/null +++ b/tests/v1/core/test_output.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.v1.core.sched.output import NewRequestData + + +def _create_new_requests_data(prompt_embeds: torch.Tensor | None) -> NewRequestData: + return NewRequestData( + req_id="test_req", + prompt_token_ids=None, + mm_features=[], + sampling_params=None, + 
pooling_params=None, + block_ids=([],), + num_computed_tokens=0, + lora_request=None, + prompt_embeds=prompt_embeds, + ) + + +def test_repr_with_none() -> None: + """Test repr when prompt_embeds is None.""" + new_requests_data = _create_new_requests_data(None) + + assert "prompt_embeds_shape=None" in repr(new_requests_data) + assert "prompt_embeds_shape=None" in new_requests_data.anon_repr() + + +def test_repr_with_multi_element_tensor() -> None: + """Test repr when prompt_embeds is a multi-element tensor.""" + prompt_embeds = torch.randn(10, 768) + new_requests_data = _create_new_requests_data(prompt_embeds) + + assert "prompt_embeds_shape=torch.Size([10, 768])" in repr(new_requests_data) + assert "prompt_embeds_shape=torch.Size([10, 768])" in new_requests_data.anon_repr() diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index abfab43499b2a..b69fa87ebddc8 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -68,7 +68,9 @@ class NewRequestData: ) def __repr__(self) -> str: - prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None + prompt_embeds_shape = ( + self.prompt_embeds.shape if self.prompt_embeds is not None else None + ) return ( f"NewRequestData(" f"req_id={self.req_id}," @@ -88,7 +90,9 @@ class NewRequestData: prompt_token_ids_len = ( len(self.prompt_token_ids) if self.prompt_token_ids is not None else None ) - prompt_embeds_shape = self.prompt_embeds.shape if self.prompt_embeds else None + prompt_embeds_shape = ( + self.prompt_embeds.shape if self.prompt_embeds is not None else None + ) return ( f"NewRequestData(" f"req_id={self.req_id}," From 37b15e97e8443a7fd76f5aa95a78d5593f7241a4 Mon Sep 17 00:00:00 2001 From: EanWang211123 Date: Fri, 28 Nov 2025 14:05:45 +0800 Subject: [PATCH 04/98] [Multimodal][Speculative Decoding]Eagle3 mm support, enablement on qwen3vl (#29594) Signed-off-by: Tsai, Louie Signed-off-by: EanWang211123 Co-authored-by: Louie Tsai Co-authored-by: Cyrus Leung 
--- tests/models/registry.py | 4 ++++ tests/v1/e2e/test_spec_decode.py | 14 ++++++++++++++ vllm/model_executor/models/qwen3_vl.py | 23 ++++++++++++++++++++++- vllm/model_executor/models/registry.py | 1 + vllm/v1/spec_decode/eagle.py | 8 ++++---- 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index c9d4823d52792..1f4a106c06b4b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -913,6 +913,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "Qwen/Qwen2.5-VL-7B-Instruct", speculative_model="Rayzl/qwen2.5-vl-7b-eagle3-sgl", ), + "Eagle3Qwen3vlForCausalLM": _HfExamplesInfo( + "Qwen/Qwen3-VL-8B-Instruct", + speculative_model="taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + ), "Qwen3NextMTP": _HfExamplesInfo( "Qwen/Qwen3-Next-80B-A3B-Instruct", min_transformers_version="4.56.3" ), diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 03396270a31cb..3a25f7411eecd 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -283,6 +283,19 @@ def test_speculators_model_integration( ["model_setup", "mm_enabled", "enable_chunked_prefill"], [ (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), + pytest.param( + ( + "eagle3", + "Qwen/Qwen3-VL-8B-Instruct", + "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + 1, + ), + False, + False, + marks=pytest.mark.skip( + reason="architecture of its eagle3 is LlamaForCausalLMEagle3" + ), + ), pytest.param( ( "eagle3", @@ -352,6 +365,7 @@ def test_speculators_model_integration( ], ids=[ "qwen3_eagle3", + "qwen3_vl_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", "llama3_eagle3", diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4cd6fa14c32df..52d31e70a8f05 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -89,6 +89,7 @@ from vllm.utils.collection_utils import is_list_of from .interfaces 
import ( MultiModalEmbeddings, + SupportsEagle3, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, @@ -1122,9 +1123,14 @@ class Qwen3LLMModel(Qwen3Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): + if layer_idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer( positions, hidden_states, @@ -1144,6 +1150,9 @@ class Qwen3LLMModel(Qwen3Model): {"hidden_states": hidden_states, "residual": residual} ) hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states @@ -1186,7 +1195,12 @@ class Qwen3LLMForCausalLM(Qwen3ForCausalLM): dummy_inputs=Qwen3VLDummyInputsBuilder, ) class Qwen3VLForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + nn.Module, + SupportsMultiModal, + SupportsLoRA, + SupportsPP, + SupportsMRoPE, + SupportsEagle3, ): merge_by_field_config = True multimodal_cpu_fields = {"image_grid_thw", "video_grid_thw"} @@ -1279,6 +1293,13 @@ class Qwen3VLForConditionalGeneration( self.visual_dim = config.vision_config.out_hidden_size self.multiscale_dim = self.visual_dim * self.deepstack_num_level + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.language_model.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.language_model.model.layers) + return (2, num_layers // 2, num_layers - 3) + def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: # get deepstack_input_embeds from buffer, and clear the buffer return IntermediateTensors( diff --git a/vllm/model_executor/models/registry.py 
b/vllm/model_executor/models/registry.py index ba9f33819c950..0d582043e8c02 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -414,6 +414,7 @@ _SPECULATIVE_DECODING_MODELS = { "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 7600df48150ac..305abdade8da6 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1017,10 +1017,10 @@ class EagleProposer: if supports_multimodal(target_model): # handle multimodality - if ( - self.get_model_name(target_model) - == "Qwen2_5_VLForConditionalGeneration" - ): + if self.get_model_name(target_model) in [ + "Qwen2_5_VLForConditionalGeneration", + "Qwen3VLForConditionalGeneration", + ]: self.model.config.image_token_index = target_model.config.image_token_id else: self.model.config.image_token_index = ( From f4b76056ee5c3a3f917527da5be3786e1b8530c6 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 28 Nov 2025 14:05:48 +0800 Subject: [PATCH 05/98] Improve enable chunked_prefill & prefix_caching logic. 
(#26623) Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: Cyrus Leung --- .../pooling/test_auto_prefix_cache_support.py | 4 +- tests/test_config.py | 240 +++++++++++++++++- vllm/config/model.py | 109 ++++++++ vllm/config/pooler.py | 6 +- vllm/config/vllm.py | 76 ++---- vllm/engine/arg_utils.py | 90 +++---- vllm/model_executor/models/bert.py | 4 +- vllm/model_executor/models/interfaces_base.py | 35 ++- vllm/model_executor/models/modernbert.py | 3 +- vllm/model_executor/models/registry.py | 15 +- vllm/v1/engine/core.py | 7 +- 11 files changed, 456 insertions(+), 133 deletions(-) diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py index 0904c7e877ef4..3795f2a5d8664 100644 --- a/tests/models/language/pooling/test_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -105,8 +105,6 @@ def test_embed_models( def test_non_causal_models( hf_runner, vllm_runner, example_prompts, model: str, dtype: str ) -> None: - with vllm_runner( - model, max_model_len=512, dtype=dtype, enable_prefix_caching=True - ) as vllm_model: + with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: cache_config = vllm_model.llm.llm_engine.cache_config assert not cache_config.enable_prefix_caching diff --git a/tests/test_config.py b/tests/test_config.py index 080e4d2afacc6..112b02edd0389 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import logging import os from dataclasses import MISSING, Field, asdict, dataclass, field from unittest.mock import patch @@ -602,6 +602,244 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) +@pytest.mark.parametrize( + ("model_id", "expected_attn_type", 
"expected_result", "reason"), + [ + # pooling models + ( + "jason9693/Qwen2.5-1.5B-apeach", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "Qwen/Qwen3-Embedding-0.6B", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "Qwen/Qwen2.5-Math-PRM-7B", + "decoder", + False, + "Pooling models with step pooling does not support chunked prefill.", + ), + ( + "internlm/internlm2-1_8b-reward", + "decoder", + False, + "Pooling models with all pooling does not support chunked prefill.", + ), + ( + "BAAI/bge-base-en", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "boltuix/NeuroBERT-NER", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "papluca/xlm-roberta-base-language-detection", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + ( + "intfloat/e5-small", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + # multimodal models + ( + "openai/clip-vit-base-patch32", + "decoder", + True, + "Pooling models with causal attn and last pooling support chunked prefill.", + ), + ( + "google/siglip-base-patch16-224", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support chunked prefill.", + ), + # generate models + ( + "Qwen/Qwen3-0.6B", + "decoder", + True, + "Generative models support chunked prefill.", + ), + ( + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "hybrid", + True, + "Generative models support chunked prefill.", + ), + ( + "ibm-granite/granite-4.0-h-small", + "hybrid", + True, + "Generative models support chunked prefill.", + 
), + ( + "state-spaces/mamba-130m-hf", + "attention_free", + True, + "Generative models support chunked prefill.", + ), + # encoder_decoder models + ( + "openai/whisper-small", + "encoder_decoder", + False, + "Encoder decoder models does not support chunked prefill.", + ), + ], +) +def test_is_chunked_prefill_supported( + model_id: str, + expected_attn_type: str, + expected_result: bool, + reason: str, + caplog_vllm, +): + model_config = ModelConfig(model_id, trust_remote_code=True) + assert model_config.attn_type == expected_attn_type + with caplog_vllm.at_level(level=logging.DEBUG): + assert model_config.is_chunked_prefill_supported == expected_result + assert reason in caplog_vllm.text + + +@pytest.mark.parametrize( + ("model_id", "expected_attn_type", "expected_result", "reason"), + [ + # pooling models + ( + "jason9693/Qwen2.5-1.5B-apeach", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "Qwen/Qwen3-Embedding-0.6B", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "Qwen/Qwen2.5-Math-PRM-7B", + "decoder", + False, + "Pooling models with step pooling does not support prefix caching.", + ), + ( + "internlm/internlm2-1_8b-reward", + "decoder", + False, + "Pooling models with all pooling does not support prefix caching.", + ), + ( + "BAAI/bge-base-en", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "boltuix/NeuroBERT-NER", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "papluca/xlm-roberta-base-language-detection", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + ( + "intfloat/e5-small", + 
"encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + # multimodal models + ( + "openai/clip-vit-base-patch32", + "decoder", + True, + "Pooling models with causal attn and last pooling support prefix caching.", + ), + ( + "google/siglip-base-patch16-224", + "encoder_only", + False, + "Pooling models with bidirectional attn does not support prefix caching.", + ), + # generate models + ( + "Qwen/Qwen3-0.6B", + "decoder", + True, + "Generative models support prefix caching.", + ), + ( + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "hybrid", + False, + "Hybrid models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + ( + "ibm-granite/granite-4.0-h-small", + "hybrid", + False, + "Hybrid models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + ( + "state-spaces/mamba-130m-hf", + "attention_free", + False, + "Attention free models does not support prefix caching since the feature is still experimental.", # noqa: E501 + ), + # encoder_decoder models + ( + "openai/whisper-small", + "encoder_decoder", + False, + "Encoder decoder models does not support prefix caching.", + ), + ], +) +def test_is_prefix_caching_supported( + model_id: str, + expected_attn_type: str, + expected_result: bool, + reason: str, + caplog_vllm, +): + model_config = ModelConfig(model_id, trust_remote_code=True) + assert model_config.attn_type == expected_attn_type + with caplog_vllm.at_level(level=logging.DEBUG): + assert model_config.is_prefix_caching_supported == expected_result + assert reason in caplog_vllm.text + + @pytest.mark.parametrize( ("backend", "custom_ops", "expected"), [ diff --git a/vllm/config/model.py b/vllm/config/model.py index 21d602b30ac1a..b9ae4fec14efa 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -107,6 +107,10 @@ _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "draft": [], } +AttnTypeStr = Literal[ + "decoder", 
AttnTypeStr = Literal[
    "decoder", "encoder", "encoder_only", "encoder_decoder", "attention_free", "hybrid"
]


@property
def attn_type(self) -> AttnTypeStr:
    """Classify this model's attention structure (see ``AttnTypeStr``)."""
    if self.pooler_config is not None:
        # Pooling models: CLS pooling implies bidirectional (encoder-only)
        # attention; otherwise trust the causal flag on the HF config.
        if self._model_info.default_pooling_type.lower() == "cls":
            return "encoder_only"
        if not getattr(self.hf_config, "is_causal", True):
            return "encoder_only"
        return self._model_info.attn_type
    if self.is_hybrid:
        return "hybrid"
    if self.is_attention_free:
        return "attention_free"
    if self.is_encoder_decoder:
        return "encoder_decoder"
    return "decoder"


@property
def is_chunked_prefill_supported(self) -> bool:
    """Whether chunked prefill can be enabled for this model.

    Logs the reason for the decision at DEBUG level.
    """
    attn_type = self.attn_type
    if self.pooler_config is None:
        # Generative models: everything except encoder-decoder supports it.
        if attn_type == "encoder_decoder":
            logger.debug("Encoder decoder models does not support chunked prefill.")
            return False
        logger.debug("Generative models support chunked prefill.")
        return True

    # Pooling models.
    if attn_type == "encoder_only":
        logger.debug(
            "Pooling models with bidirectional attn does not support "
            "chunked prefill."
        )
        return False
    if attn_type == "decoder":
        pooling_type = self.pooler_config.pooling_type.lower()
        if pooling_type in ("all", "mean", "step", "cls"):
            logger.debug(
                "Pooling models with %s pooling does not "
                "support chunked prefill.",
                pooling_type,
            )
            return False
        # pooling_type == "last"
        logger.debug(
            "Pooling models with causal attn and last pooling support "
            "chunked prefill."
        )
        return True
    # vllm currently does not have pooling models using hybrid,
    # attention_free or encoder_decoder attn types.
    return attn_type != "encoder_decoder"


@property
def is_prefix_caching_supported(self) -> bool:
    """Whether automatic prefix caching can be enabled for this model.

    Logs the reason for the decision at DEBUG level.
    """
    attn_type = self.attn_type
    if self.pooler_config is None:
        # Generative models.
        if attn_type == "hybrid":
            logger.debug(
                "Hybrid models does not support prefix caching since the feature "
                "is still experimental."
            )
            return False
        if attn_type == "attention_free":
            logger.debug(
                "Attention free models does not support prefix caching since the "
                "feature is still experimental."
            )
            return False
        if attn_type == "encoder_decoder":
            logger.debug("Encoder decoder models does not support prefix caching.")
            return False
        # attn_type == "decoder"
        logger.debug("Generative models support prefix caching.")
        return True

    # Pooling models.
    if attn_type == "encoder_only":
        logger.debug(
            "Pooling models with bidirectional attn does not "
            "support prefix caching."
        )
        return False
    if attn_type == "decoder":
        pooling_type = self.pooler_config.pooling_type.lower()
        if pooling_type in ("all", "mean", "step", "cls"):
            logger.debug(
                "Pooling models with %s pooling does not "
                "support prefix caching.",
                pooling_type,
            )
            return False
        # pooling_type == "last"
        logger.debug(
            "Pooling models with causal attn and last pooling support "
            "prefix caching."
        )
        return True
    # vllm currently does not have pooling models using hybrid,
    # attention_free or encoder_decoder attn types.
    return False
- ) - if not getattr(self.model_config.hf_config, "is_causal", True): - disable_chunked_prefill_reasons.append( - "Only models using causal attention support chunked " - "prefill and prefix caching; disabling both." - ) - elif self.model_config.is_encoder_decoder: - from vllm.multimodal import MULTIMODAL_REGISTRY - - self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) - ) - logger.debug( - "Encoder-decoder model detected: setting " - "`max_num_encoder_input_tokens` to encoder length (%s)", - self.scheduler_config.max_num_encoder_input_tokens, - ) - if ( - self.model_config.architecture == "WhisperForConditionalGeneration" - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" - ): - logger.warning( - "Whisper is known to have issues with " - "forked workers. If startup is hanging, " - "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " - "to 'spawn'." - ) - - # Final off-switch for CP/APC: - # Disable for (a) collected blockers, (b) encoder–decoder, or - # (c) explicit CP=False when APC wasn't requested. - # Do NOT disable merely because the resolved CP flag is False. 
- apc_requested = ( - self.cache_config is not None and self.cache_config.enable_prefix_caching - ) - if ( - disable_chunked_prefill_reasons - or (self.model_config is not None and self.model_config.is_encoder_decoder) - or ( - self.scheduler_config.enable_chunked_prefill is False - and not apc_requested + self.scheduler_config.max_num_encoder_input_tokens = ( + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) ) - ): - for reason in disable_chunked_prefill_reasons: - logger.info(reason) - self.scheduler_config.enable_chunked_prefill = False - self.scheduler_config.long_prefill_token_threshold = 0 - - if self.cache_config is not None: - self.cache_config.enable_prefix_caching = False + logger.debug( + "Encoder-decoder model detected: setting " + "`max_num_encoder_input_tokens` to encoder length (%s)", + self.scheduler_config.max_num_encoder_input_tokens, + ) + if ( + self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + logger.warning( + "Whisper is known to have issues with " + "forked workers. If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'." + ) if ( self.kv_events_config is not None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4c9a82d25223..ad5a34c56161c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1349,30 +1349,10 @@ class EngineArgs: self.tokenizer = model_config.tokenizer self._check_feature_supported(model_config) - - # Set default arguments for V1 Engine. 
- self._set_default_args(usage_context, model_config) - # Disable chunked prefill and prefix caching for: - # POWER (ppc64le)/s390x/RISCV CPUs in V1 - if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( - CpuArchEnum.POWERPC, - CpuArchEnum.S390X, - CpuArchEnum.RISCV, - ): - logger.info( - "Chunked prefill is not supported for ARM and POWER, " - "S390X and RISC-V CPUs; " - "disabling it for V1 backend." - ) - self.enable_chunked_prefill = False - logger.info( - "Prefix caching is not supported for ARM and POWER, " - "S390X and RISC-V CPUs; " - "disabling it for V1 backend." - ) - self.enable_prefix_caching = False - - assert self.enable_chunked_prefill is not None + self._set_default_chunked_prefill_and_prefix_caching_args(model_config) + self._set_default_max_num_seqs_and_batched_tokens_args( + usage_context, model_config + ) sliding_window: int | None = None if not is_interleaved(model_config.hf_text_config): @@ -1805,34 +1785,6 @@ class EngineArgs: ) _raise_unsupported_error(feature_name=name) - @classmethod - def get_chunked_prefill_prefix_caching_defaults( - cls, - model_config: ModelConfig, - ) -> tuple[bool, bool]: - if model_config.runner_type != "pooling": - default_chunked_prefill = True - - # Disable prefix caching default for hybrid models and mamba-only - # models since the feature is still experimental. 
- default_prefix_caching = not ( - model_config.is_hybrid or model_config.is_attention_free - ) - else: - assert model_config.pooler_config is not None - - pooling_type = model_config.pooler_config.pooling_type - incremental_prefill_supported = ( - pooling_type is not None - and pooling_type.lower() == "last" - and getattr(model_config.hf_config, "is_causal", True) - ) - - default_chunked_prefill = incremental_prefill_supported - default_prefix_caching = incremental_prefill_supported - - return default_chunked_prefill, default_prefix_caching - @classmethod def get_batch_defaults( cls, @@ -1916,14 +1868,11 @@ class EngineArgs: return default_max_num_batched_tokens, default_max_num_seqs - def _set_default_args( - self, usage_context: UsageContext, model_config: ModelConfig + def _set_default_chunked_prefill_and_prefix_caching_args( + self, model_config: ModelConfig ) -> None: - """Set Default Arguments for V1 Engine.""" - ( - default_chunked_prefill, - default_prefix_caching, - ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + default_chunked_prefill = model_config.is_chunked_prefill_supported + default_prefix_caching = model_config.is_prefix_caching_supported if self.prefill_context_parallel_size > 1: default_chunked_prefill = False @@ -1984,6 +1933,29 @@ class EngineArgs: scope="local", ) + # Disable chunked prefill and prefix caching for: + # POWER (ppc64le)/s390x/RISCV CPUs in V1 + if current_platform.is_cpu() and current_platform.get_cpu_architecture() in ( + CpuArchEnum.POWERPC, + CpuArchEnum.S390X, + CpuArchEnum.RISCV, + ): + logger.info( + "Chunked prefill is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." + ) + self.enable_chunked_prefill = False + logger.info( + "Prefix caching is not supported for ARM and POWER, " + "S390X and RISC-V CPUs; " + "disabling it for V1 backend." 
+ ) + self.enable_prefix_caching = False + + def _set_default_max_num_seqs_and_batched_tokens_args( + self, usage_context: UsageContext, model_config: ModelConfig + ): world_size = self.pipeline_parallel_size * self.tensor_parallel_size ( default_max_num_batched_tokens, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 2679448bce775..e774cd647ea8c 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -32,7 +32,7 @@ from vllm.tasks import PoolingTask from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding, SupportsQuant -from .interfaces_base import default_pooling_type +from .interfaces_base import attn_type, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -432,7 +432,6 @@ class BertModel(nn.Module, SupportsQuant): return loaded_params -@default_pooling_type("ALL") class BertPoolingModel(BertModel): is_pooling_model = True @@ -864,6 +863,7 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu ) +@attn_type("encoder_only") @default_pooling_type("ALL") class BertForTokenClassification(nn.Module): is_pooling_model = True diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 85c5574bacf0a..2c99fce8d918c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -19,10 +19,14 @@ from vllm.utils.func_utils import supports_kw if TYPE_CHECKING: from vllm.config import VllmConfig + from vllm.config.model import AttnTypeStr + from vllm.config.pooler import PoolingTypeStr from vllm.model_executor.layers.pooler import Pooler else: VllmConfig = Any Pooler = Any + PoolingTypeStr = Any + AttnTypeStr = Any logger = init_logger(__name__) @@ -165,7 +169,7 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. 
""" - default_pooling_type: ClassVar[str] = "LAST" + default_pooling_type: ClassVar[PoolingTypeStr] = "LAST" """ Indicates the [vllm.config.pooler.PoolerConfig.pooling_type][] to use by default. @@ -175,6 +179,17 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): decorator to conveniently set this field. """ + attn_type: ClassVar[AttnTypeStr] = "decoder" + """ + Indicates the + [vllm.config.model.ModelConfig.attn_type][] + to use by default. + + You can use the + [vllm.model_executor.models.interfaces_base.attn_type][] + decorator to conveniently set this field. + """ + pooler: Pooler """The pooler is only called on TP rank 0.""" @@ -199,7 +214,7 @@ def is_pooling_model( _T = TypeVar("_T", bound=type[nn.Module]) -def default_pooling_type(pooling_type: str): +def default_pooling_type(pooling_type: PoolingTypeStr): """Decorator to set `VllmModelForPooling.default_pooling_type`.""" def func(model: _T) -> _T: @@ -209,5 +224,19 @@ def default_pooling_type(pooling_type: str): return func -def get_default_pooling_type(model: type[object] | object) -> str: +def get_default_pooling_type(model: type[object] | object) -> PoolingTypeStr: return getattr(model, "default_pooling_type", "LAST") + + +def attn_type(attn_type: AttnTypeStr): + """Decorator to set `VllmModelForPooling.attn_type`.""" + + def func(model: _T) -> _T: + model.attn_type = attn_type # type: ignore + return model + + return func + + +def get_attn_type(model: type[object] | object) -> AttnTypeStr: + return getattr(model, "attn_type", "decoder") diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 3a8a6c74d9d15..743bc23d9876f 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -28,7 +28,7 @@ from vllm.tasks import PoolingTask from vllm.v1.pool.metadata import PoolingMetadata from .interfaces import SupportsCrossEncoding -from .interfaces_base import default_pooling_type +from .interfaces_base import 
attn_type, default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix @@ -396,6 +396,7 @@ class ModernBertPredictionHead(nn.Module): return self.norm(self.act(self.dense(hidden_states))) +@attn_type("encoder_only") @default_pooling_type("ALL") class ModernBertForTokenClassification(nn.Module): is_pooling_model = True diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 0d582043e8c02..73a61f1148b50 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -17,7 +17,7 @@ from collections.abc import Callable, Set from dataclasses import asdict, dataclass, field from functools import lru_cache from pathlib import Path -from typing import TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import torch.nn as nn import transformers @@ -33,6 +33,14 @@ from vllm.logging_utils import logtime from vllm.transformers_utils.dynamic_module import try_get_class_from_dynamic_module from vllm.utils.hashing import safe_hash +if TYPE_CHECKING: + from vllm.config.model import AttnTypeStr + from vllm.config.pooler import PoolingTypeStr +else: + AttnTypeStr = Any + PoolingTypeStr = Any + + from .interfaces import ( has_inner_state, has_noops, @@ -47,6 +55,7 @@ from .interfaces import ( supports_transcription, ) from .interfaces_base import ( + get_attn_type, get_default_pooling_type, is_pooling_model, is_text_generation_model, @@ -509,7 +518,8 @@ class _ModelInfo: architecture: str is_text_generation_model: bool is_pooling_model: bool - default_pooling_type: str + attn_type: AttnTypeStr + default_pooling_type: PoolingTypeStr supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input_only: bool @@ -530,6 +540,7 @@ class _ModelInfo: is_text_generation_model=is_text_generation_model(model), is_pooling_model=is_pooling_model(model), default_pooling_type=get_default_pooling_type(model), + attn_type=get_attn_type(model), 
supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input_only=supports_multimodal_raw_input_only( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 8657a95b5e6e7..e3a5f51a8fc56 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -119,11 +119,12 @@ class EngineCore: # Setup scheduler. Scheduler = vllm_config.scheduler_config.get_scheduler_cls() - if len(kv_cache_config.kv_cache_groups) == 0: + if len(kv_cache_config.kv_cache_groups) == 0: # noqa: SIM102 # Encoder models without KV cache don't support # chunked prefill. But do SSM models? - logger.info("Disabling chunked prefill for model without KVCache") - vllm_config.scheduler_config.enable_chunked_prefill = False + if vllm_config.scheduler_config.enable_chunked_prefill: + logger.warning("Disabling chunked prefill for model without KVCache") + vllm_config.scheduler_config.enable_chunked_prefill = False scheduler_block_size = ( vllm_config.cache_config.block_size From b34e8775a31c1a077a1a24f22ffbf048b2a979f6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 14:43:18 +0800 Subject: [PATCH 06/98] Revert "[CPU]Update CPU PyTorch to 2.9.0 (#29589)" (#29647) Signed-off-by: DarkLight1337 --- docker/Dockerfile.cpu | 4 ++++ requirements/cpu-build.txt | 4 ++-- requirements/cpu.txt | 8 ++++---- vllm/model_executor/models/qwen3_vl.py | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 67d3fb83a0275..eb3807ef0ca4e 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -119,6 +119,7 @@ FROM base AS vllm-test-deps WORKDIR /workspace/vllm +# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ @@ 
-131,6 +132,9 @@ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ esac; \ }; \ remove_packages_not_supported_on_aarch64 && \ + sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ + sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ + sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/requirements/cpu-build.txt b/requirements/cpu-build.txt index 0c6fdd3b33cd1..81d429a5e5f8d 100644 --- a/requirements/cpu-build.txt +++ b/requirements/cpu-build.txt @@ -4,9 +4,9 @@ packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64" scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL) wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 8c04d6d5ce1b0..e23d3286f3f78 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -7,17 +7,17 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d packaging>=24.2 setuptools>=77.0.3,<81.0.0 --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.9.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" +torch==2.8.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" torch==2.9.0; platform_system == "Darwin" -torch==2.9.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +torch==2.8.0; platform_machine == "ppc64le" or platform_machine == 
"aarch64" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" -torchaudio==2.9.0; platform_machine == "ppc64le" +torchaudio==2.8.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.24.0; platform_machine == "ppc64le" +torchvision==0.23.0; platform_machine == "ppc64le" datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 52d31e70a8f05..39fe8336b84a1 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1123,14 +1123,14 @@ class Qwen3LLMModel(Qwen3Model): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): if layer_idx in self.aux_hidden_state_layers: aux_hidden_states.append(hidden_states + residual) - + hidden_states, residual = layer( positions, hidden_states, From 480598958e28fa1e2ed2f7be2d457fc6f85a1748 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Fri, 28 Nov 2025 15:53:20 +0800 Subject: [PATCH 07/98] [Feature][Bench] Add pareto visualization (#29477) Signed-off-by: rongfu.leng --- docs/contributing/benchmarks.md | 18 ++ docs/mkdocs/hooks/generate_argparse.py | 4 + vllm/benchmarks/sweep/cli.py | 3 + vllm/benchmarks/sweep/plot_pareto.py | 393 +++++++++++++++++++++++++ 4 files changed, 418 insertions(+) create mode 100644 vllm/benchmarks/sweep/plot_pareto.py diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index c9bc9cfe28a35..e4714e6266381 100644 --- a/docs/contributing/benchmarks.md +++ 
b/docs/contributing/benchmarks.md
@@ -1146,6 +1146,24 @@ vllm bench sweep plot benchmarks/results/ \
 !!! tip
     You can use `--dry-run` to preview the figures to be plotted.
 
+### Pareto visualization (tokens/s/user vs tokens/s/GPU)
+
+`vllm bench sweep plot_pareto` helps pick configurations that balance per-user and per-GPU throughput.
+
+Higher concurrency or batch size can raise GPU efficiency (per-GPU), but can add per-user latency; lower concurrency improves the per-user rate but underutilizes GPUs. The Pareto frontier shows the best achievable pairs across your runs.
+
+- x-axis: tokens/s/user = `output_throughput` ÷ concurrency (`--user-count-var`, default `max_concurrency`, fallback `max_concurrent_requests`).
+- y-axis: tokens/s/GPU = `output_throughput` ÷ GPU count (`--gpu-count-var` if set; else gpu_count is TP×PP×DP).
+- Output: a single figure at `OUTPUT_DIR/pareto/PARETO.png`.
+- Use `--label-by` to show the configuration used at each data point (default: `max_concurrency,gpu_count`).
+
+Example:
+
+```bash
+vllm bench sweep plot_pareto benchmarks/results/ \
+    --label-by max_concurrency,tensor_parallel_size,pipeline_parallel_size
+```
+
 ## Performance Benchmarks
 
 The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 735074c08b8c8..4ae64a6e4bfcc 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -94,6 +94,9 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100): bench_latency = auto_mock("vllm.benchmarks", "latency") bench_serve = auto_mock("vllm.benchmarks", "serve") bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") +bench_sweep_plot_pareto = auto_mock( + "vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs" +) bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs") bench_sweep_serve_sla = auto_mock( "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs" @@ -221,6 +224,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): "bench_latency": create_parser(bench_latency.add_cli_args), "bench_serve": create_parser(bench_serve.add_cli_args), "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), + "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args), "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args), "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args), "bench_throughput": create_parser(bench_throughput.add_cli_args), diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py index 108cd75690864..e74e0e2c181c5 100644 --- a/vllm/benchmarks/sweep/cli.py +++ b/vllm/benchmarks/sweep/cli.py @@ -6,6 +6,8 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG from .plot import SweepPlotArgs from .plot import main as plot_main +from .plot_pareto import SweepPlotParetoArgs +from .plot_pareto import main as plot_pareto_main from .serve import SweepServeArgs from .serve import main as serve_main from .serve_sla import SweepServeSLAArgs @@ -15,6 +17,7 @@ SUBCOMMANDS = ( (SweepServeArgs, serve_main), (SweepServeSLAArgs, serve_sla_main), (SweepPlotArgs, 
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Plot the Pareto frontier between tokens/s/user and tokens/s/GPU from
parameter sweep results (``vllm bench sweep plot_pareto``)."""
import argparse
import math
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import ClassVar

from vllm.utils.collection_utils import full_groupby
from vllm.utils.import_utils import PlaceholderModule

from .plot import DummyExecutor, _json_load_bytes
from .utils import sanitize_filename

try:
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns
except ImportError:
    plt = PlaceholderModule("matplotlib").placeholder_attr("pyplot")
    pd = PlaceholderModule("pandas")
    sns = PlaceholderModule("seaborn")


def _first_present(run_data: dict[str, object], keys: list[str]):
    """Return the first value found in `run_data` among `keys`, else None.

    Each key is also tried with underscores/hyphens swapped, since sweep
    results may record CLI flags in either spelling.
    """
    for key in keys:
        for candidate in {key, key.replace("_", "-"), key.replace("-", "_")}:
            if candidate in run_data:
                return run_data[candidate]
    return None


def _get_numeric(
    run_data: dict[str, object],
    keys: list[str],
    *,
    allow_zero: bool = True,
) -> float | None:
    """Look up the first of `keys` in `run_data` and coerce it to float.

    Returns None when no key is present, or when the value is zero and
    `allow_zero` is False. Raises ValueError for non-numeric values.
    """
    value = _first_present(run_data, keys)
    if value is None:
        return None

    try:
        numeric = float(value)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            f"Expected numeric value for one of {keys}, "
            f"but found {value!r} in {run_data=}"
        ) from exc

    if not allow_zero and numeric == 0:
        return None

    return numeric


def _infer_user_count(
    run_data: dict[str, object],
    user_count_var: str | None,
) -> float | None:
    """Estimate the number of concurrent users for a run, or None if unknown."""
    candidates = [user_count_var] if user_count_var else []
    candidates.append("request_rate")
    user_count = _get_numeric(run_data, candidates, allow_zero=False)
    if user_count is not None:
        return user_count

    # Fallback to the observed peak if configured value is missing.
    return _get_numeric(run_data, ["max_concurrent_requests"], allow_zero=False)


def _infer_gpu_count(
    run_data: dict[str, object],
    gpu_count_var: str | None,
) -> float:
    """Infer how many GPUs served a run.

    Tries, in order: the user-provided `gpu_count_var`, the conventional
    `num_gpus`/`gpu_count` keys (as documented in the `--gpu-count-var`
    CLI help), and finally the world size derived from TP * PP * DP.
    Defaults to 1 when nothing is recorded.
    """
    direct_candidates = [gpu_count_var] if gpu_count_var else []
    # The --gpu-count-var help documents a fallback to num_gpus/gpu_count
    # before deriving the world size from the parallelism degrees.
    direct_candidates.extend(["num_gpus", "gpu_count"])
    direct_gpu_count = _get_numeric(run_data, direct_candidates, allow_zero=False)
    if direct_gpu_count:
        return direct_gpu_count

    world_size = 1.0
    for keys in (
        ["tensor_parallel_size", "tp"],
        ["pipeline_parallel_size", "pp"],
        ["data_parallel_size", "dp"],
    ):
        degree = _get_numeric(run_data, keys)
        if degree:
            world_size *= degree

    return world_size


def _get_throughput(
    run_data: dict[str, object],
    throughput_var: str,
) -> float:
    """Read the throughput metric from a run, raising if it is absent."""
    throughput = _get_numeric(run_data, [throughput_var])
    if throughput is None:
        raise ValueError(
            f"Cannot find throughput metric {throughput_var!r} in run data. "
            f"Available keys: {sorted(run_data)}"
        )

    return throughput


def _prepare_records(
    all_data: list[dict[str, object]],
    *,
    user_count_var: str | None,
    gpu_count_var: str | None,
) -> tuple[list[dict[str, object]], int]:
    """Augment run records with per-user and per-GPU throughput columns.

    Returns the prepared records together with the number of runs skipped
    because no user count could be inferred.
    """
    prepared = []
    skipped_missing_users = 0

    for record in all_data:
        throughput = _get_throughput(record, "output_throughput")
        user_count = _infer_user_count(record, user_count_var)
        if user_count is None:
            skipped_missing_users += 1
            continue

        gpu_count = _infer_gpu_count(record, gpu_count_var)
        prepared.append(
            {
                **record,
                "tokens_per_user": throughput / user_count,
                "tokens_per_gpu": throughput / gpu_count,
                "user_count_estimate": user_count,
                "gpu_count": gpu_count,
            }
        )

    return prepared, skipped_missing_users


def _pareto_frontier(
    df: "pd.DataFrame",
    x_col: str,
    y_col: str,
    *,
    epsilon: float = 1e-9,
) -> "pd.DataFrame":
    """Return the rows on the Pareto frontier (maximizing both columns).

    Rows are scanned in decreasing `x_col` order; a row is kept when its
    `y_col` is not worse than the best seen so far (within `epsilon`, so
    exact and numerically-equal ties are all retained).
    """
    sorted_df = df.sort_values([x_col, y_col], ascending=[False, False])
    frontier_indices = []
    best_y = -math.inf

    for idx, row in sorted_df.iterrows():
        y_val = row[y_col]
        if y_val >= best_y - epsilon:
            frontier_indices.append(idx)
            best_y = max(best_y, y_val)

    return df.loc[frontier_indices]


def _get_fig_path(
    fig_dir: Path,
    fig_group: tuple[tuple[str, str], ...],
) -> Path:
    """Build the output image path for a figure group under `fig_dir`."""
    parts = ["PARETO"]
    if fig_group:
        parts.extend(f"{k}={v}" for k, v in fig_group)
    filename = sanitize_filename("-".join(parts) + ".png")
    return fig_dir / filename


def _plot_fig(
    fig_dir: Path,
    fig_group_data: tuple[tuple[tuple[str, str], ...], list[dict[str, object]]],
    label_by: list[str],
    *,
    dry_run: bool,
):
    """Draw one Pareto figure: a scatter of all runs plus the frontier line."""
    fig_group, fig_data = fig_group_data
    fig_path = _get_fig_path(fig_dir, fig_group)

    print("[BEGIN FIGURE]")
    print(f"Group: {dict(fig_group)}")
    print(f"Output file: {fig_path}")

    if dry_run:
        print("[END FIGURE]")
        return

    df = pd.DataFrame.from_records(fig_data)
    df = df.dropna(subset=["tokens_per_user", "tokens_per_gpu"])

    if df.empty:
        print("No data points available after filtering; skipping.")
        print("[END FIGURE]")
        return

    frontier = _pareto_frontier(df, "tokens_per_user", "tokens_per_gpu")
    frontier = frontier.sort_values("tokens_per_user")

    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x="tokens_per_user",
        y="tokens_per_gpu",
        color="0.5",
        alpha=0.6,
        ax=ax,
        label="All runs",
    )
    sns.lineplot(
        data=frontier,
        x="tokens_per_user",
        y="tokens_per_gpu",
        marker="o",
        ax=ax,
        label="Pareto frontier",
    )

    if label_by:
        # Annotate only frontier points so the figure stays readable.
        for _, row in frontier.iterrows():
            label_parts = [f"{key}={row[key]}" for key in label_by if key in row]
            if label_parts:
                ax.text(
                    row["tokens_per_user"],
                    row["tokens_per_gpu"],
                    "\n".join(label_parts),
                    fontsize=8,
                )

    ax.set_xlabel("Tokens/s/user")
    ax.set_ylabel("Tokens/s/GPU")
    ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.6)
    fig.tight_layout()
    fig.savefig(fig_path)
    plt.close(fig)

    print(
        f"Plotted {len(df)} points; Pareto frontier size: {len(frontier)}.",
    )
    print("[END FIGURE]")


def plot_pareto(
    output_dir: Path,
    user_count_var: str | None,
    gpu_count_var: str | None,
    label_by: list[str],
    *,
    dry_run: bool,
):
    """Collect every `summary.json` under `output_dir` and plot the
    tokens/s/user vs tokens/s/GPU Pareto frontier to `OUTPUT_DIR/pareto/`.

    Raises ValueError when no results (or no usable data points) are found.
    """
    fig_dir = output_dir / "pareto"
    raw_data = [
        run_data
        for path in output_dir.rglob("**/summary.json")
        for run_data in _json_load_bytes(path)
    ]

    if not raw_data:
        raise ValueError(f"Did not find any parameter sweep results under {output_dir}")

    fig_dir.mkdir(parents=True, exist_ok=True)

    prepared_data, skipped_missing_users = _prepare_records(
        raw_data,
        user_count_var=user_count_var,
        gpu_count_var=gpu_count_var,
    )

    if skipped_missing_users:
        print(
            f"Skipped {skipped_missing_users} runs without a user count "
            "(`max_concurrency` or `max_concurrent_requests`).",
        )

    if not prepared_data:
        raise ValueError(
            "No data points with both throughput and user count available "
            "to plot Pareto frontier.",
        )

    # A single group today; kept for parity with `plot.py`'s grouping
    # machinery so per-group figures can be added later.
    fig_groups = full_groupby(
        prepared_data,
        key=lambda item: tuple(),
    )

    with DummyExecutor() if len(fig_groups) <= 1 else ProcessPoolExecutor() as executor:
        all(
            executor.map(
                partial(
                    _plot_fig,
                    fig_dir,
                    label_by=label_by,
                    dry_run=dry_run,
                ),
                fig_groups,
            )
        )


@dataclass
class SweepPlotParetoArgs:
    """CLI arguments for `vllm bench sweep plot_pareto`."""

    output_dir: Path
    user_count_var: str | None
    gpu_count_var: str | None
    label_by: list[str]
    dry_run: bool

    parser_name: ClassVar[str] = "plot_pareto"
    parser_help: ClassVar[str] = (
        "Plot Pareto frontier between tokens/s/user and tokens/s/GPU "
        "from parameter sweep results."
    )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        """Validate the parsed CLI namespace and build this dataclass."""
        output_dir = Path(args.OUTPUT_DIR)
        if not output_dir.exists():
            raise ValueError(f"No parameter sweep results under {output_dir}")

        label_by = [] if not args.label_by else args.label_by.split(",")

        return cls(
            output_dir=output_dir,
            user_count_var=args.user_count_var,
            gpu_count_var=args.gpu_count_var,
            label_by=label_by,
            dry_run=args.dry_run,
        )

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser):
        """Register this subcommand's arguments on `parser` and return it."""
        parser.add_argument(
            "OUTPUT_DIR",
            type=str,
            default="results",
            help="The directory containing the sweep results to plot.",
        )
        parser.add_argument(
            "--user-count-var",
            type=str,
            default="max_concurrency",
            help="Result key that stores concurrent user count. "
            "Falls back to max_concurrent_requests if missing.",
        )
        parser.add_argument(
            "--gpu-count-var",
            type=str,
            default=None,
            help="Result key that stores GPU count. "
            "If not provided, falls back to num_gpus/gpu_count, then to "
            "tensor_parallel_size * pipeline_parallel_size * data_parallel_size.",
        )
        parser.add_argument(
            "--label-by",
            type=str,
            default="max_concurrency,gpu_count",
            help="Comma-separated list of fields to annotate on Pareto frontier "
            "points.",
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            help="If set, prints the figures to plot without drawing them.",
        )

        return parser


def run_main(args: SweepPlotParetoArgs):
    return plot_pareto(
        output_dir=args.output_dir,
        user_count_var=args.user_count_var,
        gpu_count_var=args.gpu_count_var,
        label_by=args.label_by,
        dry_run=args.dry_run,
    )


def main(args: argparse.Namespace):
    run_main(SweepPlotParetoArgs.from_cli_args(args))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=SweepPlotParetoArgs.parser_help)
    SweepPlotParetoArgs.add_cli_args(parser)

    main(parser.parse_args())
assert dp_master_ip_key in nodes[0], ( - "The DP master node (ip: %s) is missing or dead", - dp_master_ip, + f"The DP master node (ip: {dp_master_ip}) is missing or dead" ) device_str = current_platform.ray_device_key n_node_devices: list[int] = [ @@ -446,8 +445,7 @@ class CoreEngineActorManager: if key != "node:__internal_head__" and key.startswith("node:") ] assert len(node_ip_keys) == 1, ( - "Zero or multiple node IP keys found in node resources: %s", - node_ip_keys, + f"Zero or multiple node IP keys found in node resources: {node_ip_keys}" ) node_ip_key = node_ip_keys[0] node_ip = node_ip_key.split(":")[1] @@ -464,11 +462,9 @@ class CoreEngineActorManager: if node_ip == dp_master_ip: if dp_size_available < dp_size_local: raise ValueError( - "Not enough resources to allocate %s DP ranks " - "on DP master node %s, possible to fit %s DP ranks", - dp_size_local, - dp_master_ip, - dp_size_available, + f"Not enough resources to allocate {dp_size_local} DP ranks " + f"on DP master node {dp_master_ip}, possible to fit " + f"{dp_size_available} DP ranks." 
) dp_size_to_allocate = dp_size_local elif pack_strategy == "strict": From b2c1d294faca96643dbc2413d604ca160f458f0d Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 28 Nov 2025 09:44:47 +0100 Subject: [PATCH 09/98] [BUGFIX] MistralTokenizer._call__ adds an invalid EOS token (#29607) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- tests/tokenization/test_mistral_tokenizer.py | 68 +++++++++++++++++++ vllm/transformers_utils/tokenizers/mistral.py | 20 +++++- 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index 1ada8ee187c38..c80b698ba3848 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -331,6 +331,7 @@ class TestMistralTokenizer: ) == token_ids ) + assert mistral_tokenizer.encode_one("") == [] def test_encode(self, mistral_tokenizer: MistralTokenizer): token_ids = ( @@ -370,6 +371,51 @@ class TestMistralTokenizer: mistral_tokenizer.encode("Hello world !", add_special_tokens=False) == token_ids[1:] ) + assert mistral_tokenizer.encode("", add_special_tokens=False) == [] + + def test_call(self, mistral_tokenizer: MistralTokenizer): + token_ids = ( + [1, 22177, 4304, 2662] + if mistral_tokenizer.is_tekken + else [1, 23325, 2294, 1686] + ) + attn_mask = [1 for _ in range(len(token_ids))] + + # Test 1: default + assert mistral_tokenizer("Hello world !") == { + "attention_mask": attn_mask[1:], + "input_ids": token_ids[1:], + } + # Test 2: special tokens + assert mistral_tokenizer("Hello world !", add_special_tokens=True) == { + "attention_mask": attn_mask, + "input_ids": token_ids, + } + # Test 3: special tokens + truncation + assert mistral_tokenizer( + "Hello 
world !", add_special_tokens=True, truncation=True, max_length=3 + ) == { + "attention_mask": attn_mask[:-1], + "input_ids": token_ids[:-1], + } + # Test 4: special tokens + no truncation + max length + assert mistral_tokenizer( + "Hello world !", add_special_tokens=True, max_length=3 + ) == { + "attention_mask": attn_mask, + "input_ids": token_ids, + } + # Test 5: empty string + assert mistral_tokenizer("") == { + "attention_mask": [], + "input_ids": [], + } + + with pytest.raises( + ValueError, + match=(r"`text_pair` is not supported by `MistralTokenizer.__call__`."), + ): + mistral_tokenizer("Hello world !", "invalid pair") @pytest.mark.parametrize( "openai_request,add_generation_prompt,continue_final_message,expected_output,decoded_expected_output", @@ -1087,6 +1133,24 @@ class TestMistralTokenizer: ) == expected_tokens[mistral_tokenizer.is_tekken] ) + assert ( + mistral_tokenizer.decode( + ids[mistral_tokenizer.is_tekken], + skip_special_tokens=skip_special_tokens, + ) + == expected_tokens[mistral_tokenizer.is_tekken] + ) + + def test_decode_empty( + self, + mistral_tokenizer: MistralTokenizer, + ): + assert ( + mistral_tokenizer.decode( + [], + ) + == "" + ) def test_decode_int( self, @@ -1390,6 +1454,8 @@ class TestMistralTokenizer: == expected_strings[mistral_tokenizer.is_tekken] ) + assert mistral_tokenizer.convert_tokens_to_string([]) == "" + @pytest.mark.parametrize( "skip_special_tokens,tuple_expected_tokens", ( @@ -2220,3 +2286,5 @@ class TestMistralTokenizer: ids, skip_special_tokens=skip_special_tokens ) assert actual_tokens == expected_tokens + + assert mistral_tokenizer.convert_ids_to_tokens([]) == [] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 39198a1f3d815..caff43c55ce85 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -312,13 +312,27 @@ class MistralTokenizer(TokenizerBase): truncation: bool = False, max_length: 
int | None = None, ): - return self.transformers_tokenizer( + if text_pair is not None: + raise ValueError( + "`text_pair` is not supported by `MistralTokenizer.__call__`." + ) + + encoded = self.transformers_tokenizer( text=text, text_pair=text_pair, add_special_tokens=add_special_tokens, truncation=truncation, max_length=max_length, ) + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, revert to only call self.transformers_tokenizer(...). + # Hack to fix wrongly added eos token, when fix will be supported the condition + # below will be False even before the revert is done. + if encoded["input_ids"] and encoded["input_ids"][-1] == self.eos_token_id: + encoded["input_ids"].pop(-1) + if attention_mask := encoded.get("attention_mask"): + attention_mask.pop(-1) + return encoded @property def vocab(self) -> list[str]: @@ -349,6 +363,8 @@ class MistralTokenizer(TokenizerBase): max_length: int | None = None, add_special_tokens: bool | None = None, ) -> list[int]: + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, directly call self.transformers_tokenizer.encode(...). encoded = self.tokenizer.encode( text, bos=add_special_tokens is not False, eos=False ) @@ -387,6 +403,8 @@ class MistralTokenizer(TokenizerBase): ) def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str: + # TODO(juliendenize): once https://github.com/huggingface/transformers/pull/41962 + # is in, directly call self.transformers_tokenizer.decode(...). 
if isinstance(ids, int): ids = [ids] From 5f5521bd5d7d38d380640166294d97a839cf7ef9 Mon Sep 17 00:00:00 2001 From: Filipp Fisin <48059208+qGentry@users.noreply.github.com> Date: Fri, 28 Nov 2025 09:45:10 +0100 Subject: [PATCH 10/98] Fix parameter order in GPT-OSS weight loading function for non-MXFP4 weights (#29506) Signed-off-by: Filipp Fisin <48059208+qGentry@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/models/gpt_oss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 9de3e261941b1..cff16b7a7a8cd 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -647,8 +647,8 @@ class GptOssModel(nn.Module): ) else: return self._load_weights_other( - ep_rank_start, ep_rank_end, + ep_rank_start, heads_per_rank, head_start, weights, From ccbdf51bd57761a7a7e7a5adf685fcec67c9c1bd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 28 Nov 2025 17:19:25 +0800 Subject: [PATCH 11/98] [Doc] Reorganize benchmark docs (#29658) Signed-off-by: DarkLight1337 --- docs/.nav.yml | 5 + docs/benchmarking/README.md | 7 + .../benchmarks.md => benchmarking/cli.md} | 335 +++--------------- docs/benchmarking/dashboard.md | 58 +++ docs/benchmarking/sweeps.md | 178 ++++++++++ 5 files changed, 291 insertions(+), 292 deletions(-) create mode 100644 docs/benchmarking/README.md rename docs/{contributing/benchmarks.md => benchmarking/cli.md} (71%) create mode 100644 docs/benchmarking/dashboard.md create mode 100644 docs/benchmarking/sweeps.md diff --git a/docs/.nav.yml b/docs/.nav.yml index c8bf00efb2370..d30c0f12eba4c 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -52,6 +52,11 @@ nav: - Plugins: - design/*plugin*.md - design/* + - Benchmarking: + - benchmarking/README.md + - benchmarking/cli.md + - benchmarking/sweeps.md + - benchmarking/dashboard.md - API Reference: - api/README.md - api/vllm diff --git 
a/docs/benchmarking/README.md b/docs/benchmarking/README.md new file mode 100644 index 0000000000000..238290d4762b3 --- /dev/null +++ b/docs/benchmarking/README.md @@ -0,0 +1,7 @@ +# Benchmark Suites + +vLLM provides comprehensive benchmarking tools for performance testing and evaluation: + +- **[Benchmark CLI](./cli.md)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing. +- **[Parameter Sweeps](./sweeps.md)**: Automate `vllm bench` runs for multiple configurations, useful for [optimization and tuning](../configuration/optimization.md). +- **[Performance Dashboard](./dashboard.md)**: Automated CI that publishes benchmarks on each commit. diff --git a/docs/contributing/benchmarks.md b/docs/benchmarking/cli.md similarity index 71% rename from docs/contributing/benchmarks.md rename to docs/benchmarking/cli.md index e4714e6266381..44a4c40125952 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/benchmarking/cli.md @@ -1,22 +1,10 @@ ---- -toc_depth: 4 ---- +# Benchmark CLI -# Benchmark Suites +This section guides you through running benchmark tests with the extensive datasets supported on vLLM. -vLLM provides comprehensive benchmarking tools for performance testing and evaluation: +It's a living document, updated as new features and datasets become available. -- **[Benchmark CLI](#benchmark-cli)**: `vllm bench` CLI tools and specialized benchmark scripts for interactive performance testing -- **[Parameter sweeps](#parameter-sweeps)**: Automate `vllm bench` runs for multiple configurations -- **[Performance benchmarks](#performance-benchmarks)**: Automated CI benchmarks for development - -## Benchmark CLI - -This section guides you through running benchmark tests with the extensive -datasets supported on vLLM. It's a living document, updated as new features and datasets -become available. - -### Dataset Overview +## Dataset Overview