From 1501a4070ec1f5e3f8f1ffd0099dc32d85a9ad98 Mon Sep 17 00:00:00 2001 From: Jeffrey Wang Date: Sat, 20 Dec 2025 02:29:31 -0800 Subject: [PATCH 1/8] [Bugfix] Read truncate_prompt_tokens from pooling_params in AsyncLLM.encode() (#31013) Signed-off-by: Jeffrey Wang --- vllm/engine/protocol.py | 6 +++++- vllm/v1/engine/async_llm.py | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d94951a0cffc8..bf656cf23de65 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -71,7 +71,11 @@ class EngineClient(ABC): truncate_prompt_tokens: int | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model.""" + """Generate outputs for a request from a pooling model. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove this argument in v0.15. + """ ... @abstractmethod diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a6ee241c41151..1cbe4718f2e5c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,6 +4,7 @@ import asyncio import os import socket import time +import warnings from collections.abc import AsyncGenerator, Iterable, Mapping from copy import copy from typing import Any, cast @@ -627,6 +628,9 @@ class AsyncLLM(EngineClient): The caller of generate() iterates the returned AsyncGenerator, returning the RequestOutput back to the caller. + + NOTE: truncate_prompt_tokens is deprecated in v0.14. + TODO: Remove truncate_prompt_tokens in v0.15. """ try: @@ -641,9 +645,19 @@ class AsyncLLM(EngineClient): if tokenization_kwargs is None: tokenization_kwargs = {} + + if truncate_prompt_tokens is not None: + warnings.warn( + "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` " + "is deprecated and will be removed in v0.15. " + "Please use `pooling_params.truncate_prompt_tokens` instead.", + DeprecationWarning, + stacklevel=2, + ) + _validate_truncation_size( self.model_config.max_model_len, - truncate_prompt_tokens, + pooling_params.truncate_prompt_tokens, tokenization_kwargs, ) From 560ae9638c00e761d6f1ca33882249dbfebbe8aa Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Sat, 20 Dec 2025 21:45:27 +0800 Subject: [PATCH 2/8] [XPU] enable fp8 online streaming quantization (#30944) Signed-off-by: Yan Ma --- .../model_executor/layers/quantization/fp8.py | 14 +- .../layers/quantization/ipex_quant.py | 122 +++--------------- 2 files changed, 29 insertions(+), 107 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a86fb3d309525..30ca64238ae94 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -124,11 +124,13 @@ def get_fp8_moe_backend( block_quant: bool, moe_parallel_config: FusedMoEParallelConfig, with_lora_support: bool, -) -> Fp8MoeBackend: +) -> Fp8MoeBackend | None: """ Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. """ + if current_platform.is_xpu(): + return None if with_lora_support: return Fp8MoeBackend.TRITON # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100. @@ -292,6 +294,13 @@ class Fp8Config(QuantizationConfig): return UnquantizedLinearMethod() return XPUFp8LinearMethod(fp8_config) elif isinstance(layer, FusedMoE): + if is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping, + ): + return UnquantizedFusedMoEMethod(layer.moe_config) + return XPUFp8MoEMethod(fp8_config, layer) elif isinstance(layer, Attention): return Fp8KVCacheMethod(self) @@ -1107,7 +1116,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: if ( - self.rocm_aiter_moe_enabled + current_platform.is_xpu() + or self.rocm_aiter_moe_enabled or self.use_marlin or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index f33ee43727f19..9de2924ec71b1 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -6,13 +6,8 @@ from typing import Any, Optional import torch from packaging import version from torch.nn import Module -from torch.nn.parameter import Parameter from vllm._ipex_ops import ipex_ops as ops -from vllm.model_executor.layers.fused_moe import ( - FusedMoEMethodBase, - FusedMoeWeightScaleSupported, -) from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.linear import ( LinearBase, @@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import ( QuantizationMethods, ) from vllm.model_executor.layers.quantization.awq import AWQLinearMethod -from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod +from vllm.model_executor.layers.quantization.fp8 import ( + Fp8Config, + Fp8LinearMethod, + Fp8OnlineMoEMethod, +) from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - maybe_create_device_identity, -) -from vllm.model_executor.parameter import ModelWeightParameter -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform MIN_IPEX_VERSION = "2.6.0" @@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod): def __init__(self, quant_config: Fp8Config): super().__init__(quant_config) - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - maybe_create_device_identity() - - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - layer.logical_widths = output_partition_sizes - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.orig_dtype = params_dtype - layer.weight_block_size = None - weight = ModelWeightParameter( - data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=params_dtype, - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight", weight) - def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return # If checkpoint not serialized fp8, quantize the weights. if not self.quant_config.is_checkpoint_fp8_serialized: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) # Update the layer with the new values. - layer.weight = Parameter(qweight, requires_grad=False) - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + replace_parameter(layer, "weight", qweight.data) + replace_parameter(layer, "weight_scale", weight_scale.data) layer.input_scale = None def apply( @@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod): return output -class XPUFp8MoEMethod(FusedMoEMethodBase): +class XPUFp8MoEMethod(Fp8OnlineMoEMethod): def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): - super().__init__(layer.moe_config) + super().__init__(quant_config, layer) self.quant_config = quant_config - def create_weights( - self, - layer: Module, - num_experts: int, - hidden_size: int, - intermediate_size_per_partition: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - layer.intermediate_size_per_partition = intermediate_size_per_partition - layer.hidden_size = hidden_size - layer.num_experts = num_experts - layer.orig_dtype = params_dtype - layer.weight_block_size = None - # WEIGHTS - w13_weight = torch.nn.Parameter( - torch.empty( - num_experts, - 2 * intermediate_size_per_partition, - hidden_size, - dtype=params_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w13_weight", w13_weight) - set_weight_attrs(w13_weight, extra_weight_attrs) - - w2_weight = torch.nn.Parameter( - torch.empty( - num_experts, - hidden_size, - intermediate_size_per_partition, - dtype=params_dtype, - ), - requires_grad=False, - ) - layer.register_parameter("w2_weight", w2_weight) - set_weight_attrs(w2_weight, extra_weight_attrs) - - # Allocate 2 scales for w1 and w3 respectively. - # They will be combined to a single scale after weight loading. - w13_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False - ) - w2_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, dtype=torch.float32), requires_grad=False - ) - layer.register_parameter("w13_weight_scale", w13_weight_scale) - layer.register_parameter("w2_weight_scale", w2_weight_scale) - - extra_weight_attrs.update( - {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value} - ) - # INPUT_SCALES - layer.w13_input_scale = None - layer.w2_input_scale = None - def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return if not self.quant_config.is_checkpoint_fp8_serialized: fp8_dtype = current_platform.fp8_dtype() w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) @@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) ) - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w2_weight", w2_weight) + import intel_extension_for_pytorch as ipex ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts From 54c892438479c0a8aec9a10b8db570034af92443 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Date: Sat, 20 Dec 2025 13:22:04 -0500 Subject: [PATCH 3/8] [MoE Refactor][5/N] Isolate zero expert to LongCatFlash (#28891) Signed-off-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Signed-off-by: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com> Signed-off-by: baonudesifeizhai Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw Co-authored-by: Robert Shaw --- tests/test_routing_simulator.py | 2 +- .../layers/fused_moe/__init__.py | 4 + .../fused_moe/fused_moe_modular_method.py | 10 +- vllm/model_executor/layers/fused_moe/layer.py | 53 +---- .../fused_moe/unquantized_fused_moe_method.py | 10 +- .../layers/fused_moe/zero_expert_fused_moe.py | 189 ++++++++++++++++++ .../layers/quantization/awq_marlin.py | 2 +- .../layers/quantization/bitsandbytes.py | 2 +- .../compressed_tensors_moe.py | 12 +- .../layers/quantization/experts_int8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 12 +- .../layers/quantization/gguf.py | 2 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/modelopt.py | 4 +- .../layers/quantization/moe_wna16.py | 2 +- .../layers/quantization/mxfp4.py | 4 +- .../layers/quantization/quark/quark_moe.py | 6 +- .../model_executor/layers/quantization/rtn.py | 2 +- vllm/model_executor/models/longcat_flash.py | 53 +++-- 19 files changed, 264 insertions(+), 109 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py index e8826eb441a24..44cbdeed45074 100644 --- a/tests/test_routing_simulator.py +++ b/tests/test_routing_simulator.py @@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device): envs.environment_variables[env_name] = lambda s=strategy: s # Test the select_experts method - topk_weights, topk_ids, _ = fused_moe.select_experts( + topk_weights, topk_ids = fused_moe.select_experts( hidden_states=hidden_states, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 8fee4038b60b8..3d248e7fb9945 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) from vllm.model_executor.layers.fused_moe.utils import activation_without_mul +from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( + ZeroExpertFusedMoE, +) from vllm.triton_utils import HAS_TRITON _config: dict[str, Any] | None = None @@ -54,6 +57,7 @@ __all__ = [ "FusedMoEPrepareAndFinalize", "RoutingMethodType", "SharedFusedMoE", + "ZeroExpertFusedMoE", "activation_without_mul", "override_config", "get_config", diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 9c9bc2514bb4b..30ff1bf2f008a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, zero_expert_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): expert_map=None if self.disable_expert_map else layer.expert_map, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6a65b06014bca..2e7267d56d838 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, RoutingMethodType, ) -from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) @@ -350,8 +349,6 @@ class FusedMoE(CustomOp): num_redundant_experts: int = 0, has_bias: bool = False, is_sequence_parallel=False, - zero_expert_num: int | None = 0, - zero_expert_type: str | None = None, expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, routing_method_type: int | None = None, @@ -409,8 +406,6 @@ class FusedMoE(CustomOp): self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts - self.zero_expert_num = zero_expert_num - self.zero_expert_type = zero_expert_type # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping @@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp): self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: + ) -> tuple[torch.Tensor, torch.Tensor]: """ Route the input hidden states to the top-k experts based on the router logits. Returns: - (topk_weights, topk_ids, zero_expert_result) - (tuple[torch.Tensor, torch.Tensor, torch.Tensor]): - The weights, expert ids, and zero expert computation result. + (topk_weights, topk_ids) + (tuple[torch.Tensor, torch.Tensor]): + The weights and expert ids. **Compatibility**: When EPLB is not enabled, the returned ids are equivalent to global logical ids, so should be compatible with @@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp): assert topk_ids.dtype == indices_type or indices_type is None - # Compute zero expert result if needed - if ( - self.zero_expert_num is not None - and self.zero_expert_num > 0 - and self.zero_expert_type is not None - and self.global_num_experts is not None - ): - zero_expert_result = zero_experts_compute_triton( - expert_indices=topk_ids, - expert_scales=topk_weights, - num_experts=self.global_num_experts, - zero_expert_type=self.zero_expert_type, - hidden_states=hidden_states, - ) - else: - zero_expert_result = None - return topk_weights, topk_ids, zero_expert_result + return topk_weights, topk_ids def must_reduce_shared_expert_outputs(self) -> bool: """ @@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp): fused_output = torch.ops.vllm.moe_forward( hidden_states, router_logits, self.layer_name ) - if self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(fused_output, tuple) - fused_output, zero_expert_result = fused_output - return (reduce_output(fused_output) + zero_expert_result)[ - ..., :og_hidden_states - ] - else: - return reduce_output(fused_output)[..., :og_hidden_states] + return reduce_output(fused_output)[..., :og_hidden_states] else: if current_platform.is_tpu() or current_platform.is_cpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we @@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp): final_hidden_states, ) - if self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, tuple) - assert self.shared_experts is None - final_hidden_states, zero_expert_result = final_hidden_states - if zero_expert_result is not None: - final_hidden_states += zero_expert_result - if not skip_result_store: if self.shared_experts is None: full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( @@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp): shared_output, final_hidden_states, ) - elif self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, tuple) - final_hidden_states, zero_expert_result = final_hidden_states def combine_output(states: torch.Tensor) -> torch.Tensor: if do_naive_dispatch_combine: @@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp): final_hidden_states[0], combine_output(final_hidden_states[1]), ) - elif self.zero_expert_num is not None and self.zero_expert_num > 0: - assert isinstance(final_hidden_states, torch.Tensor) - return (combine_output(final_hidden_states), zero_expert_result) else: return combine_output(final_hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 6182f10aa70f0..1ee7b65b22e3f 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -295,7 +295,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, zero_expert_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -336,13 +336,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map=layer.expert_map, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result def forward_cpu( self, diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py new file mode 100644 index 0000000000000..97d21767f4fc3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from contextlib import contextmanager + +import torch +from torch import nn + +from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +class ZeroExpertFusedMoE(FusedMoE): + """ + A FusedMoE operation that also computes the results of zero experts. + Zero experts perform identity operations (scaled pass-through) instead + of full MLP computations. + + This class uses memoization to avoid redundant routing computation: + routing is computed once and reused for both zero expert computation + and the main FusedMoE forward pass. + """ + + def __init__( + self, + zero_expert_num: int, + zero_expert_type: str, + router: nn.Module, + **kwargs, + ): + # ZeroExpertFusedMoE manages its own custom_routing_function for memoization + assert ( + "custom_routing_function" not in kwargs + or kwargs.get("custom_routing_function") is None + ), ( + "ZeroExpertFusedMoE does not support external custom_routing_function. " + "It manages its own for routing memoization." + ) + + # Automatically slice router's e_score_correction_bias to only include + # real experts (not zero_experts) for the base FusedMoE. + # The full bias will be used temporarily in forward() for routing. + if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs: + num_real_experts = kwargs["num_experts"] + router_bias = router.e_score_correction_bias + user_bias = kwargs.get("e_score_correction_bias") + + # Use router's bias if: + # 1. User didn't provide bias, or + # 2. User provided full bias (same size as router) + if user_bias is None or user_bias.shape[0] == router_bias.shape[0]: + kwargs["e_score_correction_bias"] = router_bias[:num_real_experts] + + # FusedMoE no longer accepts zero_expert_num/zero_expert_type. + # We handle zero experts ourselves in forward(). + super().__init__(**kwargs) + # Store the actual zero_expert_num and zero_expert_type for our own use + self._actual_zero_expert_num = zero_expert_num + self._actual_zero_expert_type = zero_expert_type + self._router = router # Full router (includes zero experts) + + # Expose zero_expert_num and zero_expert_type as attributes for + # compatibility with quantization methods that check these attributes + self.zero_expert_num = 0 + self.zero_expert_type = None + + # Memoization state for routing results + self._memoized_topk_weights: torch.Tensor | None = None + self._memoized_topk_ids: torch.Tensor | None = None + + # Create custom_routing_function to reuse memoized routing results + def custom_routing_function(hidden_states, gating_output, topk, renormalize): + """Return memoized `topk_weights` and `topk_ids`.""" + if self._memoized_topk_weights is None or self._memoized_topk_ids is None: + raise RuntimeError( + "ZeroExpertFusedMoE: routing results not memoized. " + "Call select_experts first to compute routing." + ) + return self._memoized_topk_weights, self._memoized_topk_ids + + self.custom_routing_function = custom_routing_function + + @contextmanager + def _temporarily_set_attrs(self, **attrs): + """ + Temporarily set attributes using object.__setattr__ and restore them. + + This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues. + When PyTorch Dynamo traces the forward pass, it cannot handle + nn.Module.__setattr__ calls (which include parameter registration logic), + resulting in "Unsupported" errors. Using object.__setattr__ directly + sets the attribute without triggering nn.Module's custom __setattr__, + allowing Dynamo to trace the code successfully. + """ + originals = {key: getattr(self, key) for key in attrs} + try: + for key, value in attrs.items(): + object.__setattr__(self, key, value) + yield + finally: + for key, value in originals.items(): + object.__setattr__(self, key, value) + + def _compute_zero_expert_result( + self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + ) -> torch.Tensor | None: + """Compute zero expert results using pre-computed routing.""" + if ( + self._actual_zero_expert_num is None + or self._actual_zero_expert_num <= 0 + or self._actual_zero_expert_type is None + ): + return None + + return zero_experts_compute_triton( + expert_indices=topk_ids.clone(), + expert_scales=topk_weights.clone(), + num_experts=self.logical_num_experts, + zero_expert_type=self._actual_zero_expert_type, + hidden_states=hidden_states, + ) + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, # Full logits including zero experts + ) -> torch.Tensor: + """ + Forward pass with zero expert support and routing memoization. + + Args: + hidden_states: Input hidden states + router_logits: Full router logits (including zero experts) + + Returns: + Combined output from real experts and zero experts + """ + # Prepare temporary attribute overrides for routing computation + temp_attrs = { + "custom_routing_function": None, # Disable for first routing + } + if self._router is not None: + temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias + + # Compute routing with temporary attributes + # Pass full router_logits (including zero experts) so that zero experts + # can be properly identified in topk_ids + with self._temporarily_set_attrs(**temp_attrs): + topk_weights, topk_ids = self.select_experts( + hidden_states=hidden_states, + router_logits=router_logits, # Full logits (includes zero experts) + ) + + # Compute zero expert result if needed + zero_expert_result = self._compute_zero_expert_result( + hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + ) + + # Memoize routing results for reuse in super().forward() + self._memoized_topk_weights = topk_weights + self._memoized_topk_ids = topk_ids + + # Slice router_logits for real experts only + router_logits_sliced = router_logits[..., : self.logical_num_experts] + + # Compute real expert results (will reuse memoized routing via + # custom_routing_function) + # zero_expert_num is already 0, so FusedMoE won't handle zero experts + fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits_sliced, + ) + + # Combine results + # Both zero_expert_result and fused_out are computed from the same + # hidden_states, so they should be on the same device. + if zero_expert_result is not None: + fused_out = fused_out + zero_expert_result + + # Clear memoization after use + self._memoized_topk_weights = None + self._memoized_topk_ids = None + + return fused_out diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 314848721a80a..602d02d2f15a4 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 1fd959cb3423d..efe5677045e4b 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fc359a3067a9c..f4038801c266b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): e_score_correction_bias=layer.e_score_correction_bias, ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): f"{layer.activation} not supported for Marlin MoE." ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." ) assert self.moe_quant_config is not None - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 11097cf36f5ca..56b11b22f7ff5 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 30ca64238ae94..4b2438133dd6b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1292,13 +1292,11 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) - select_result = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) - topk_weights, topk_ids, zero_expert_result = select_result - if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_fused_experts, @@ -1353,13 +1351,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) - if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: - assert not isinstance(result, tuple), ( - "Shared + zero experts are mutually exclusive not yet supported" - ) - return result, zero_expert_result - else: - return result + return result class Fp8OnlineMoEMethod(Fp8MoEMethod): diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 9dd734f2fea6a..9600bb42295dc 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): "fused GGUF MoE method." ) - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 347c7b2008d12..d2dafca99a230 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index aa3937d4c03ff..54e8673fcfbb8 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -796,7 +796,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) # Expert selection - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -1599,7 +1599,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): x_routing, _ = x else: x_routing = x - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x_routing, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 4bedb951a33f5..513f6f7b21abc 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase): from vllm.model_executor.layers.fused_moe import fused_experts assert layer.activation == "silu", "Only SiLU activation is supported." - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 832925825c453..dc0fbfa7df35a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError("EPLB is not supported for mxfp4") if self.mxfp4_backend == Mxfp4Backend.MARLIN: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -990,7 +990,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): ): from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 0b9b098afb1f6..81970480319ab 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) @@ -738,7 +738,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index b2ecb0b175f81..dce9c661ec332 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - topk_weights, topk_ids, _ = layer.select_experts( + topk_weights, topk_ids = layer.select_experts( hidden_states=x, router_logits=router_logits, ) diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index c5441283f9711..774737387639b 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig): self.intermediate_size = ( self.ffn_hidden_size if hasattr(self, "ffn_hidden_size") - else self.intermediate_size + else intermediate_size ) if hasattr(self, "moe_intermediate_size"): self.moe_intermediate_size = self.moe_intermediate_size @@ -280,10 +280,6 @@ class LongcatMoe(nn.Module): ): super().__init__() self.hidden_size = hidden_size - self.zero_expert_num = config.zero_expert_num - self.zero_expert_type = config.zero_expert_type - self.routed_scaling_factor = config.routed_scaling_factor - self.enable_eplb = enable_eplb # Gate always runs at half / full precision for now. self.rounter_params_dtype = params_dtype if config.router_dtype == "float32": @@ -291,25 +287,27 @@ class LongcatMoe(nn.Module): self.router = LongcatRouter( config=config, - zero_expert_num=self.zero_expert_num, + zero_expert_num=config.zero_expert_num, rounter_params_dtype=self.rounter_params_dtype, prefix=f"{prefix}.gate", ) - self.experts = FusedMoE( + assert config.zero_expert_num is not None + assert config.zero_expert_type is not None + self.experts = ZeroExpertFusedMoE( + zero_expert_num=config.zero_expert_num, + zero_expert_type=config.zero_expert_type, + router=self.router, num_experts=num_experts, top_k=top_k, hidden_size=hidden_size, intermediate_size=intermediate_size, reduce_results=True, params_dtype=params_dtype, - e_score_correction_bias=self.router.e_score_correction_bias, renormalize=False, quant_config=quant_config, prefix=f"{prefix}.experts", - zero_expert_num=self.zero_expert_num, - zero_expert_type=self.zero_expert_type, - enable_eplb=self.enable_eplb, + enable_eplb=enable_eplb, routed_scaling_factor=config.routed_scaling_factor, ) @@ -317,11 +315,34 @@ class LongcatMoe(nn.Module): num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - router_logits = self.router(hidden_states.to(self.rounter_params_dtype)) - final_hidden_states = self.experts( - hidden_states=hidden_states, router_logits=router_logits + # Align to FusedMoE padded hidden size to avoid dim mismatch + padded_hidden = self.experts.hidden_size + if hidden_dim < padded_hidden: + hidden_states_padded = torch.nn.functional.pad( + hidden_states, + (0, padded_hidden - hidden_dim), + mode="constant", + value=0.0, + ) + else: + hidden_states_padded = hidden_states + + router_logits_full = self.router( + hidden_states_padded.to(self.rounter_params_dtype) ) + # ZeroExpertFusedMoE handles routing memoization and zero expert computation + # internally. Pass full router_logits (including zero experts) so that + # zero experts can be properly identified in routing. + final_hidden_states = self.experts( + hidden_states=hidden_states_padded, + router_logits=router_logits_full, # Full logits (includes zero experts) + ) + + # Crop back to original hidden dimension if padded earlier + if padded_hidden != hidden_dim: + final_hidden_states = final_hidden_states[..., :hidden_dim] + return final_hidden_states.view(num_tokens, hidden_dim) @@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module): hidden_states = self.self_attn[0]( positions=positions, hidden_states=hidden_states, + llama_4_scaling=None, ) hidden_states, residual = self.post_attention_layernorm[0]( @@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module): hidden_states = self.self_attn[1]( positions=positions, hidden_states=hidden_states, + llama_4_scaling=None, ) hidden_states, residual = self.post_attention_layernorm[1]( hidden_states, residual From ee52d9901d368da35a80048c299ccb53e78a9a01 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 21 Dec 2025 04:02:57 +0800 Subject: [PATCH 4/8] [Quantization] support logical_widths for fp8 marlin (#30962) Signed-off-by: Jinzhen Lin Signed-off-by: Jinzhen Lin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../layers/quantization/utils/marlin_utils_fp8.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 93d238a0524d8..1fb5223b07d76 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -142,10 +142,20 @@ def prepare_fp8_layer_for_marlin( # marlin kernel only support channel-wise and group-wise quantization # we need to convert the scales if weight_block_size is None: + logical_widths = getattr(layer, "logical_widths", []) if scales.nelement() == 1: # tensor-wise quantization -> channel-wise quantization # (1, 1) =>(repeat)=> (1, size_n) scales = scales.view(1, 1).repeat_interleave(part_size_n, 1) + elif scales.nelement() == len(logical_widths): + # tensor-wise quantization with logical_widths -> + # channel-wise quantization + assert sum(logical_widths) == part_size_n, ( + f"Sum of logical_widths ({sum(logical_widths)}) must be equal " + f"to part_size_n ({part_size_n})" + ) + lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64) + scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1) elif scales.nelement() > 1 and scales.nelement() != part_size_n: assert part_size_n % scales.nelement() == 0 s_size = scales.nelement() From ae0770fa6bcc58892692f0d1b2a3f0a300dbe82a Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 20 Dec 2025 16:48:49 -0500 Subject: [PATCH 5/8] [CI] Fix H200 Distributed test (#31054) Signed-off-by: Lucas Wilkinson --- .buildkite/test-amd.yaml | 6 +- .buildkite/test-pipeline.yaml | 6 +- .buildkite/test_areas/distributed.yaml | 4 +- examples/offline_inference/data_parallel.py | 164 +++++--------------- 4 files changed, 51 insertions(+), 129 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 9a770869b1d17..bd00c47df8cb0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1254,13 +1254,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1508,7 +1508,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f644504a5b937..3c823fc872b05 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1109,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1334,7 +1334,7 @@ steps: - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 52d57c99fcfb5..1a3739cc2417a 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -145,7 +145,7 @@ steps: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py - label: Distributed Tests (2 GPUs)(B200) @@ -171,7 +171,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index be0b846995a92..bcf1ba307eff1 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -5,25 +5,25 @@ Usage: Single node: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 + -dp=2 \ + -tp=2 Multi-node: Node 0 (assume the node has ip of 10.99.48.128): python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ + -dp=2 \ + -tp=2 \ + --nnodes=2 \ --node-rank=0 \ --master-addr=10.99.48.128 \ --master-port=13345 Node 1: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ - --dp-size=2 \ - --tp-size=2 \ - --node-size=2 \ + -dp=2 \ + -tp=2 \ + --nnodes=2 \ --node-rank=1 \ --master-addr=10.99.48.128 \ --master-port=13345 @@ -32,103 +32,40 @@ Multi-node: import os from time import sleep -from vllm import LLM, SamplingParams +from vllm import LLM, EngineArgs, SamplingParams from vllm.platforms import current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.network_utils import get_open_port -def parse_args(): - import argparse +def create_parser(): + parser = FlexibleArgumentParser(description="Data Parallel Inference") - parser = argparse.ArgumentParser(description="Data Parallel Inference") - parser.add_argument( - "--model", - type=str, - default="ibm-research/PowerMoE-3b", - help="Model name or path", - ) - parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size") - parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size") - parser.add_argument( - "--node-size", type=int, default=1, help="Total number of nodes" - ) - parser.add_argument( - "--node-rank", type=int, default=0, help="Rank of the current node" - ) - parser.add_argument( - "--master-addr", type=str, default="", help="Master node IP address" - ) - parser.add_argument("--master-port", type=int, default=0, help="Master node port") - parser.add_argument( - "--enforce-eager", action="store_true", help="Enforce eager mode execution." - ) - parser.add_argument( - "--trust-remote-code", action="store_true", help="Trust remote code." - ) - parser.add_argument( - "--max-num-seqs", - type=int, - default=64, - help=("Maximum number of sequences to be processed in a single iteration."), - ) - parser.add_argument( - "--max-model-len", - type=int, - help=("Maximum number of tokens to be processed in a single iteration."), + # Add all engine args + EngineArgs.add_cli_args(parser) + parser.set_defaults( + model="ibm-research/PowerMoE-3b", + enable_expert_parallel=True, ) + + # Add timeout (not in EngineArgs) parser.add_argument( "--timeout", type=int, default=300, - help=("Number of seconds before unresponsive process is killed."), + help="Number of seconds before unresponsive process is killed.", ) - parser.add_argument( - "--gpu-memory-utilization", - type=float, - default=0.8, - help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), - ) - parser.add_argument( - "--enable-dbo", - action="store_true", - help=("Enable microbatched execution"), - ) - parser.add_argument( - "--compilation-config", - type=int, - help=("Compilation optimization (O) mode 0-3."), - ) - parser.add_argument( - "--quantization", - type=str, - ) - parser.add_argument( - "--disable-expert-parallel", - dest="enable_expert_parallel", - action="store_false", - help="Disable expert parallel (default: enabled).", - ) - parser.set_defaults(enable_expert_parallel=True) - return parser.parse_args() + + return parser def main( - model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - GPUs_per_dp_rank, - enforce_eager, - enable_expert_parallel, - trust_remote_code, - max_num_seqs, - max_model_len, - compilation_config, - gpu_memory_utilization, - enable_dbo, - quantization, + engine_args, ): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) @@ -173,19 +110,7 @@ def main( ) # Create an LLM. - llm = LLM( - model=model, - tensor_parallel_size=GPUs_per_dp_rank, - enforce_eager=enforce_eager, - enable_expert_parallel=enable_expert_parallel, - trust_remote_code=trust_remote_code, - max_num_seqs=max_num_seqs, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enable_dbo=enable_dbo, - quantization=quantization, - compilation_config=compilation_config, - ) + llm = LLM(**engine_args) outputs = llm.generate(prompts, sampling_params) # Print the outputs. for i, output in enumerate(outputs): @@ -204,22 +129,29 @@ def main( if __name__ == "__main__": - args = parse_args() + parser = create_parser() + args = vars(parser.parse_args()) - dp_size = args.dp_size - tp_size = args.tp_size - node_size = args.node_size - node_rank = args.node_rank + # Extract DP-specific args + dp_size = args.pop("data_parallel_size") + nnodes = args.get("nnodes", 1) + node_rank = args.get("node_rank", 0) + master_addr = args.get("master_addr", "") + master_port = args.get("master_port", 0) + timeout = args.pop("timeout") - if node_size == 1: + # Remaining args are engine args + engine_args = args + + if nnodes == 1: dp_master_ip = "127.0.0.1" dp_master_port = get_open_port() else: - dp_master_ip = args.master_addr - dp_master_port = args.master_port + dp_master_ip = master_addr + dp_master_port = master_port - assert dp_size % node_size == 0, "dp_size should be divisible by node_size" - dp_per_node = dp_size // node_size + assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes" + dp_per_node = dp_size // nnodes from multiprocessing import Process @@ -235,29 +167,19 @@ if __name__ == "__main__": proc = Process( target=main, args=( - args.model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - tp_size, - args.enforce_eager, - args.enable_expert_parallel, - args.trust_remote_code, - args.max_num_seqs, - args.max_model_len, - args.compilation_config, - args.gpu_memory_utilization, - args.enable_dbo, - args.quantization, + engine_args, ), ) proc.start() procs.append(proc) exit_code = 0 for proc in procs: - proc.join(timeout=args.timeout) + proc.join(timeout=timeout) if proc.exitcode is None: print(f"Killing process {proc.pid} that didn't stop within 5 minutes.") proc.kill() From 7c73ceb5812ace65e0d1b6ada3622b8b9f0400c0 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Sun, 21 Dec 2025 05:58:11 +0800 Subject: [PATCH 6/8] [Quantization] add marlin w4a8/w8a8 check (#31061) Signed-off-by: Jinzhen Lin --- .../layers/quantization/utils/marlin_utils.py | 12 ++++++++++++ .../layers/quantization/utils/marlin_utils_fp4.py | 12 ++++++++++++ .../layers/quantization/utils/marlin_utils_fp8.py | 4 ++++ 3 files changed, 28 insertions(+) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 66e979b505f0d..3de2b6509e460 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -594,9 +594,15 @@ def apply_awq_marlin_linear( a_scales = None if input_dtype == torch.int8: + assert quant_type == scalar_types.uint4, ( + "W8A8-INT8 is not supported by marlin kernel." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) a_scales = a_scales * input_global_scale elif input_dtype == torch.float8_e4m3fn: + assert quant_type == scalar_types.uint4, ( + "INT8 weight + FP8 activation is not supported." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) output = ops.gptq_marlin_gemm( @@ -649,9 +655,15 @@ def apply_rtn_marlin_linear( a_scales = None if input_dtype == torch.int8: + assert quant_type == scalar_types.uint4b8, ( + "W8A8-INT8 is not supported by marlin kernel." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) a_scales = a_scales * input_global_scale elif input_dtype == torch.float8_e4m3fn: + assert quant_type == scalar_types.uint4b8, ( + "INT8 weight + FP8 activation is not supported." + ) reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype) output = ops.gptq_marlin_gemm( diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 876c724bf972d..4d0a34c3be119 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin( ) is_nvfp4 = hasattr(layer, "weight_scale_2") + if input_dtype is not None and input_dtype.itemsize == 1: + if is_nvfp4: + raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.") + elif input_dtype != torch.float8_e4m3fn: + raise RuntimeError("MXFP4 weight + INT8 activation is not supported.") + group_size = 16 if is_nvfp4 else 32 part_size_n = layer.output_size_per_partition @@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin( ) is_nvfp4 = hasattr(layer, "w13_weight_scale_2") + if input_dtype is not None and input_dtype.itemsize == 1: + if is_nvfp4: + raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.") + elif input_dtype != torch.float8_e4m3fn: + raise RuntimeError("MXFP4 weight + INT8 activation is not supported.") + group_size = 16 if is_nvfp4 else 32 e = layer.num_experts diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 1fb5223b07d76..4d2f2fd71ad36 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -99,6 +99,8 @@ def prepare_fp8_layer_for_marlin( "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads." ) + if input_dtype is not None and input_dtype.itemsize == 1: + raise RuntimeError("Marlin W8A8 is not supported.") part_size_n = layer.output_size_per_partition part_size_k = layer.input_size_per_partition @@ -206,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin( "be used leveraging the Marlin kernel. This may degrade " "performance for compute-heavy workloads." ) + if input_dtype is not None and input_dtype.itemsize == 1: + raise RuntimeError("Marlin W8A8 is not supported.") e = layer.num_experts k = layer.hidden_size From 3e92b2b7acaa61335ecd7bea5eeed50388739194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sun, 21 Dec 2025 10:39:31 +0800 Subject: [PATCH 7/8] [BugFix]fix gpt-oss v1/completions response bug (#30608) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: princepride Signed-off-by: 汪志鹏 Co-authored-by: Chauncey Co-authored-by: Cyrus Leung Co-authored-by: bbrowning --- .../openai/test_response_api_with_harmony.py | 21 +++++++++++++++++++ tests/entrypoints/openai/test_serving_chat.py | 15 +++++-------- vllm/entrypoints/openai/serving_chat.py | 9 ++++---- vllm/tool_parsers/openai_tool_parser.py | 9 ++++++++ 4 files changed, 40 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 6f2a50020699c..8ef0d7f277d5f 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib +import importlib.util import json import time @@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages( assert ( "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str): + response = await client.chat.completions.create( + model=model_name, + messages=[{"role": "user", "content": "What is the role of AI in medicine?"}], + temperature=0.0, + max_tokens=250, + ) + + choice = response.choices[0] + assert choice.finish_reason == "length", ( + f"Expected finish_reason='length', got {choice.finish_reason}" + ) + assert choice.message.content is not None, ( + "Content should not be None when truncated" + ) + assert len(choice.message.content) > 0, "Content should not be empty" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 6fb074b8a19ba..d845913b8ee03 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -955,7 +955,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, ], ) @@ -983,7 +982,6 @@ class TestServingChatWithHarmony: input_messages_2, [ {"role": "system"}, - {"role": "developer"}, {"role": "user"}, # The analysis message should be dropped on subsequent inputs because # of the subsequent assistant message to the final channel. @@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the second turn's input - req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_2, _ = serving_chat._make_request_with_harmony(req_2) verify_harmony_messages( input_messages_2, @@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the third turn's input - req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_3, _ = serving_chat._make_request_with_harmony(req_3) verify_harmony_messages( input_messages_3, @@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony: ) # Test the Harmony messages for the fourth turn's input - req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) input_messages_4, _ = serving_chat._make_request_with_harmony(req_4) verify_harmony_messages( input_messages_4, @@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, # The reasoning that would have resulted in an analysis message is # dropped because of a later assistant message to the final channel. @@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, { "role": "assistant", @@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony: input_messages, [ {"role": "system"}, - {"role": "developer"}, {"role": "user", "content": messages[0]["content"]}, { "role": "assistant", diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d437d1e5c3b06..88d87a3334955 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1828,10 +1828,11 @@ class OpenAIServingChat(OpenAIServing): messages.append(sys_msg) # Add developer message. - dev_msg = get_developer_message( - tools=request.tools if should_include_tools else None - ) - messages.append(dev_msg) + if request.tools: + dev_msg = get_developer_message( + tools=request.tools if should_include_tools else None + ) + messages.append(dev_msg) # Add user message. messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py index db92ea8982d70..da1a9c773f78f 100644 --- a/vllm/tool_parsers/openai_tool_parser.py +++ b/vllm/tool_parsers/openai_tool_parser.py @@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser): elif msg.channel == "commentary" and not msg.recipient: commentary_content = msg_text + # Extract partial content from the parser state if the generation was truncated + if parser.current_content: + if parser.current_channel == "final": + final_content = parser.current_content + elif ( + parser.current_channel == "commentary" and not parser.current_recipient + ): + commentary_content = parser.current_content + return ExtractedToolCallInformation( tools_called=len(tool_calls) > 0, tool_calls=tool_calls, From bb80f69bc98cbf062bf030cb11185f7ba526e28a Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sun, 21 Dec 2025 11:24:34 +0800 Subject: [PATCH 8/8] add aarnphm and chaunceyjiang to the new tool_parser directory (#31088) Signed-off-by: chaunceyjiang --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d6447649cd89a..4d7a366f05e37 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,6 +15,7 @@ /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson