From 1501a4070ec1f5e3f8f1ffd0099dc32d85a9ad98 Mon Sep 17 00:00:00 2001
From: Jeffrey Wang <jeffreywang@anyscale.com>
Date: Sat, 20 Dec 2025 02:29:31 -0800
Subject: [PATCH 1/8] [Bugfix] Read truncate_prompt_tokens from pooling_params
 in AsyncLLM.encode() (#31013)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
---
 vllm/engine/protocol.py     |  6 +++++-
 vllm/v1/engine/async_llm.py | 16 +++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index d94951a0cffc8..bf656cf23de65 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -71,7 +71,11 @@ class EngineClient(ABC):
         truncate_prompt_tokens: int | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
-        """Generate outputs for a request from a pooling model."""
+        """Generate outputs for a request from a pooling model.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove this argument in v0.15.
+        """
         ...
 
     @abstractmethod
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a6ee241c41151..1cbe4718f2e5c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -4,6 +4,7 @@ import asyncio
 import os
 import socket
 import time
+import warnings
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
 from typing import Any, cast
@@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
 
         The caller of generate() iterates the returned AsyncGenerator,
         returning the RequestOutput back to the caller.
+
+        NOTE: truncate_prompt_tokens is deprecated in v0.14.
+        TODO: Remove truncate_prompt_tokens in v0.15.
         """
 
         try:
@@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
 
             if tokenization_kwargs is None:
                 tokenization_kwargs = {}
+
+            if truncate_prompt_tokens is not None:
+                warnings.warn(
+                    "The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
+                    "is deprecated and will be removed in v0.15. "
+                    "Please use `pooling_params.truncate_prompt_tokens` instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
             _validate_truncation_size(
                 self.model_config.max_model_len,
-                truncate_prompt_tokens,
+                pooling_params.truncate_prompt_tokens,
                 tokenization_kwargs,
             )
 

From 560ae9638c00e761d6f1ca33882249dbfebbe8aa Mon Sep 17 00:00:00 2001
From: Yan Ma <yan.ma@intel.com>
Date: Sat, 20 Dec 2025 21:45:27 +0800
Subject: [PATCH 2/8] [XPU] enable fp8 online streaming quantization  (#30944)

Signed-off-by: Yan Ma <yan.ma@intel.com>
---
 .../model_executor/layers/quantization/fp8.py |  14 +-
 .../layers/quantization/ipex_quant.py         | 122 +++---------------
 2 files changed, 29 insertions(+), 107 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index a86fb3d309525..30ca64238ae94 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -124,11 +124,13 @@ def get_fp8_moe_backend(
     block_quant: bool,
     moe_parallel_config: FusedMoEParallelConfig,
     with_lora_support: bool,
-) -> Fp8MoeBackend:
+) -> Fp8MoeBackend | None:
     """
     Select the primary FP8 MoE backend
     Note: Shape-specific fallbacks may still occur at runtime.
     """
+    if current_platform.is_xpu():
+        return None
     if with_lora_support:
         return Fp8MoeBackend.TRITON
     # Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
@@ -292,6 +294,13 @@ class Fp8Config(QuantizationConfig):
                 return UnquantizedLinearMethod()
             return XPUFp8LinearMethod(fp8_config)
         elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedFusedMoEMethod(layer.moe_config)
+
             return XPUFp8MoEMethod(fp8_config, layer)
         elif isinstance(layer, Attention):
             return Fp8KVCacheMethod(self)
@@ -1107,7 +1116,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
     ) -> mk.FusedMoEPrepareAndFinalize | None:
         if (
-            self.rocm_aiter_moe_enabled
+            current_platform.is_xpu()
+            or self.rocm_aiter_moe_enabled
             or self.use_marlin
             or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
         ):
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index f33ee43727f19..9de2924ec71b1 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -6,13 +6,8 @@ from typing import Any, Optional
 import torch
 from packaging import version
 from torch.nn import Module
-from torch.nn.parameter import Parameter
 
 from vllm._ipex_ops import ipex_ops as ops
-from vllm.model_executor.layers.fused_moe import (
-    FusedMoEMethodBase,
-    FusedMoeWeightScaleSupported,
-)
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.linear import (
     LinearBase,
@@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import (
     QuantizationMethods,
 )
 from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
-from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
+from vllm.model_executor.layers.quantization.fp8 import (
+    Fp8Config,
+    Fp8LinearMethod,
+    Fp8OnlineMoEMethod,
+)
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
-from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    maybe_create_device_identity,
-)
-from vllm.model_executor.parameter import ModelWeightParameter
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
 
 MIN_IPEX_VERSION = "2.6.0"
@@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
     def __init__(self, quant_config: Fp8Config):
         super().__init__(quant_config)
 
-    def create_weights(
-        self,
-        layer: torch.nn.Module,
-        input_size_per_partition: int,
-        output_partition_sizes: list[int],
-        input_size: int,
-        output_size: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        maybe_create_device_identity()
-
-        output_size_per_partition = sum(output_partition_sizes)
-        weight_loader = extra_weight_attrs.get("weight_loader")
-        layer.logical_widths = output_partition_sizes
-        layer.input_size_per_partition = input_size_per_partition
-        layer.output_size_per_partition = output_size_per_partition
-        layer.orig_dtype = params_dtype
-        layer.weight_block_size = None
-        weight = ModelWeightParameter(
-            data=torch.empty(
-                output_size_per_partition,
-                input_size_per_partition,
-                dtype=params_dtype,
-            ),
-            input_dim=1,
-            output_dim=0,
-            weight_loader=weight_loader,
-        )
-        layer.register_parameter("weight", weight)
-
     def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
         # If checkpoint not serialized fp8, quantize the weights.
         if not self.quant_config.is_checkpoint_fp8_serialized:
             qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
             # Update the layer with the new values.
-            layer.weight = Parameter(qweight, requires_grad=False)
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            replace_parameter(layer, "weight", qweight.data)
+            replace_parameter(layer, "weight_scale", weight_scale.data)
             layer.input_scale = None
 
     def apply(
@@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
         return output
 
 
-class XPUFp8MoEMethod(FusedMoEMethodBase):
+class XPUFp8MoEMethod(Fp8OnlineMoEMethod):
     def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
-        super().__init__(layer.moe_config)
+        super().__init__(quant_config, layer)
         self.quant_config = quant_config
 
-    def create_weights(
-        self,
-        layer: Module,
-        num_experts: int,
-        hidden_size: int,
-        intermediate_size_per_partition: int,
-        params_dtype: torch.dtype,
-        **extra_weight_attrs,
-    ):
-        layer.intermediate_size_per_partition = intermediate_size_per_partition
-        layer.hidden_size = hidden_size
-        layer.num_experts = num_experts
-        layer.orig_dtype = params_dtype
-        layer.weight_block_size = None
-        # WEIGHTS
-        w13_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                2 * intermediate_size_per_partition,
-                hidden_size,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight", w13_weight)
-        set_weight_attrs(w13_weight, extra_weight_attrs)
-
-        w2_weight = torch.nn.Parameter(
-            torch.empty(
-                num_experts,
-                hidden_size,
-                intermediate_size_per_partition,
-                dtype=params_dtype,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_weight", w2_weight)
-        set_weight_attrs(w2_weight, extra_weight_attrs)
-
-        # Allocate 2 scales for w1 and w3 respectively.
-        # They will be combined to a single scale after weight loading.
-        w13_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
-        )
-        w2_weight_scale = torch.nn.Parameter(
-            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        layer.register_parameter("w2_weight_scale", w2_weight_scale)
-
-        extra_weight_attrs.update(
-            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
-        )
-        # INPUT_SCALES
-        layer.w13_input_scale = None
-        layer.w2_input_scale = None
-
     def process_weights_after_loading(self, layer: Module) -> None:
+        if getattr(layer, "_already_called_process_weights_after_loading", False):
+            return
         if not self.quant_config.is_checkpoint_fp8_serialized:
             fp8_dtype = current_platform.fp8_dtype()
             w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
@@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
                 w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
                     ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
                 )
-            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
-            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            replace_parameter(layer, "w13_weight", w13_weight)
+            replace_parameter(layer, "w2_weight", w2_weight)
+
         import intel_extension_for_pytorch as ipex
 
         ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts

From 54c892438479c0a8aec9a10b8db570034af92443 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Date: Sat, 20 Dec 2025 13:22:04 -0500
Subject: [PATCH 3/8] [MoE Refactor][5/N] Isolate zero expert to LongCatFlash
 (#28891)

Signed-off-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Signed-off-by: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com>
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robertgshaw2@gmail.com>
---
 tests/test_routing_simulator.py               |   2 +-
 .../layers/fused_moe/__init__.py              |   4 +
 .../fused_moe/fused_moe_modular_method.py     |  10 +-
 vllm/model_executor/layers/fused_moe/layer.py |  53 +----
 .../fused_moe/unquantized_fused_moe_method.py |  10 +-
 .../layers/fused_moe/zero_expert_fused_moe.py | 189 ++++++++++++++++++
 .../layers/quantization/awq_marlin.py         |   2 +-
 .../layers/quantization/bitsandbytes.py       |   2 +-
 .../compressed_tensors_moe.py                 |  12 +-
 .../layers/quantization/experts_int8.py       |   2 +-
 .../model_executor/layers/quantization/fp8.py |  12 +-
 .../layers/quantization/gguf.py               |   2 +-
 .../layers/quantization/gptq_marlin.py        |   2 +-
 .../layers/quantization/modelopt.py           |   4 +-
 .../layers/quantization/moe_wna16.py          |   2 +-
 .../layers/quantization/mxfp4.py              |   4 +-
 .../layers/quantization/quark/quark_moe.py    |   6 +-
 .../model_executor/layers/quantization/rtn.py |   2 +-
 vllm/model_executor/models/longcat_flash.py   |  53 +++--
 19 files changed, 264 insertions(+), 109 deletions(-)
 create mode 100644 vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py

diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py
index e8826eb441a24..44cbdeed45074 100644
--- a/tests/test_routing_simulator.py
+++ b/tests/test_routing_simulator.py
@@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
         envs.environment_variables[env_name] = lambda s=strategy: s
 
         # Test the select_experts method
-        topk_weights, topk_ids, _ = fused_moe.select_experts(
+        topk_weights, topk_ids = fused_moe.select_experts(
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 8fee4038b60b8..3d248e7fb9945 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
     UnquantizedFusedMoEMethod,
 )
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
+from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
+    ZeroExpertFusedMoE,
+)
 from vllm.triton_utils import HAS_TRITON
 
 _config: dict[str, Any] | None = None
@@ -54,6 +57,7 @@ __all__ = [
     "FusedMoEPrepareAndFinalize",
     "RoutingMethodType",
     "SharedFusedMoE",
+    "ZeroExpertFusedMoE",
     "activation_without_mul",
     "override_config",
     "get_config",
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index 9c9bc2514bb4b..30ff1bf2f008a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
             expert_map=None if self.disable_expert_map else layer.expert_map,
         )
 
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 6a65b06014bca..2e7267d56d838 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
     RoutingMethodType,
 )
-from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     init_aiter_topK_meta_data,
 )
@@ -350,8 +349,6 @@ class FusedMoE(CustomOp):
         num_redundant_experts: int = 0,
         has_bias: bool = False,
         is_sequence_parallel=False,
-        zero_expert_num: int | None = 0,
-        zero_expert_type: str | None = None,
         expert_mapping: list[tuple[str, str, int, str]] | None = None,
         n_shared_experts: int | None = None,
         routing_method_type: int | None = None,
@@ -409,8 +406,6 @@ class FusedMoE(CustomOp):
 
         self.global_num_experts = num_experts + num_redundant_experts
         self.logical_num_experts = num_experts
-        self.zero_expert_num = zero_expert_num
-        self.zero_expert_type = zero_expert_type
 
         # Expert mapping used in self.load_weights
         self.expert_mapping = expert_mapping
@@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Route the input hidden states to the top-k experts based on the
         router logits.
 
         Returns:
-                (topk_weights, topk_ids, zero_expert_result)
-                (tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
-                The weights, expert ids, and zero expert computation result.
+                (topk_weights, topk_ids)
+                (tuple[torch.Tensor, torch.Tensor]):
+                The weights and expert ids.
 
             **Compatibility**: When EPLB is not enabled, the returned ids are
             equivalent to global logical ids, so should be compatible with
@@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp):
 
         assert topk_ids.dtype == indices_type or indices_type is None
 
-        # Compute zero expert result if needed
-        if (
-            self.zero_expert_num is not None
-            and self.zero_expert_num > 0
-            and self.zero_expert_type is not None
-            and self.global_num_experts is not None
-        ):
-            zero_expert_result = zero_experts_compute_triton(
-                expert_indices=topk_ids,
-                expert_scales=topk_weights,
-                num_experts=self.global_num_experts,
-                zero_expert_type=self.zero_expert_type,
-                hidden_states=hidden_states,
-            )
-        else:
-            zero_expert_result = None
-        return topk_weights, topk_ids, zero_expert_result
+        return topk_weights, topk_ids
 
     def must_reduce_shared_expert_outputs(self) -> bool:
         """
@@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp):
                 fused_output = torch.ops.vllm.moe_forward(
                     hidden_states, router_logits, self.layer_name
                 )
-            if self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(fused_output, tuple)
-                fused_output, zero_expert_result = fused_output
-                return (reduce_output(fused_output) + zero_expert_result)[
-                    ..., :og_hidden_states
-                ]
-            else:
-                return reduce_output(fused_output)[..., :og_hidden_states]
+            return reduce_output(fused_output)[..., :og_hidden_states]
         else:
             if current_platform.is_tpu() or current_platform.is_cpu():
                 # TODO: Once the OOM issue for the TPU backend is resolved, we
@@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp):
                     final_hidden_states,
                 )
 
-            if self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(final_hidden_states, tuple)
-                assert self.shared_experts is None
-                final_hidden_states, zero_expert_result = final_hidden_states
-                if zero_expert_result is not None:
-                    final_hidden_states += zero_expert_result
-
             if not skip_result_store:
                 if self.shared_experts is None:
                     full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
@@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp):
                     shared_output,
                     final_hidden_states,
                 )
-            elif self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(final_hidden_states, tuple)
-                final_hidden_states, zero_expert_result = final_hidden_states
 
             def combine_output(states: torch.Tensor) -> torch.Tensor:
                 if do_naive_dispatch_combine:
@@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp):
                     final_hidden_states[0],
                     combine_output(final_hidden_states[1]),
                 )
-            elif self.zero_expert_num is not None and self.zero_expert_num > 0:
-                assert isinstance(final_hidden_states, torch.Tensor)
-                return (combine_output(final_hidden_states), zero_expert_result)
             else:
                 return combine_output(final_hidden_states)
 
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 6182f10aa70f0..1ee7b65b22e3f 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -295,7 +295,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, zero_expert_result = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -336,13 +336,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 expert_map=layer.expert_map,
             )
 
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result
 
     def forward_cpu(
         self,
diff --git a/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
new file mode 100644
index 0000000000000..97d21767f4fc3
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
@@ -0,0 +1,189 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from contextlib import contextmanager
+
+import torch
+from torch import nn
+
+from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
+from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+
+
+class ZeroExpertFusedMoE(FusedMoE):
+    """
+    A FusedMoE operation that also computes the results of zero experts.
+    Zero experts perform identity operations (scaled pass-through) instead
+    of full MLP computations.
+
+    This class uses memoization to avoid redundant routing computation:
+    routing is computed once and reused for both zero expert computation
+    and the main FusedMoE forward pass.
+    """
+
+    def __init__(
+        self,
+        zero_expert_num: int,
+        zero_expert_type: str,
+        router: nn.Module,
+        **kwargs,
+    ):
+        # ZeroExpertFusedMoE manages its own custom_routing_function for memoization
+        assert (
+            "custom_routing_function" not in kwargs
+            or kwargs.get("custom_routing_function") is None
+        ), (
+            "ZeroExpertFusedMoE does not support external custom_routing_function. "
+            "It manages its own for routing memoization."
+        )
+
+        # Automatically slice router's e_score_correction_bias to only include
+        # real experts (not zero_experts) for the base FusedMoE.
+        # The full bias will be used temporarily in forward() for routing.
+        if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs:
+            num_real_experts = kwargs["num_experts"]
+            router_bias = router.e_score_correction_bias
+            user_bias = kwargs.get("e_score_correction_bias")
+
+            # Use router's bias if:
+            # 1. User didn't provide bias, or
+            # 2. User provided full bias (same size as router)
+            if user_bias is None or user_bias.shape[0] == router_bias.shape[0]:
+                kwargs["e_score_correction_bias"] = router_bias[:num_real_experts]
+
+        # FusedMoE no longer accepts zero_expert_num/zero_expert_type.
+        # We handle zero experts ourselves in forward().
+        super().__init__(**kwargs)
+        # Store the actual zero_expert_num and zero_expert_type for our own use
+        self._actual_zero_expert_num = zero_expert_num
+        self._actual_zero_expert_type = zero_expert_type
+        self._router = router  # Full router (includes zero experts)
+
+        # Expose zero_expert_num and zero_expert_type as attributes for
+        # compatibility with quantization methods that check these attributes
+        self.zero_expert_num = 0
+        self.zero_expert_type = None
+
+        # Memoization state for routing results
+        self._memoized_topk_weights: torch.Tensor | None = None
+        self._memoized_topk_ids: torch.Tensor | None = None
+
+        # Create custom_routing_function to reuse memoized routing results
+        def custom_routing_function(hidden_states, gating_output, topk, renormalize):
+            """Return memoized `topk_weights` and `topk_ids`."""
+            if self._memoized_topk_weights is None or self._memoized_topk_ids is None:
+                raise RuntimeError(
+                    "ZeroExpertFusedMoE: routing results not memoized. "
+                    "Call select_experts first to compute routing."
+                )
+            return self._memoized_topk_weights, self._memoized_topk_ids
+
+        self.custom_routing_function = custom_routing_function
+
+    @contextmanager
+    def _temporarily_set_attrs(self, **attrs):
+        """
+        Temporarily set attributes using object.__setattr__ and restore them.
+
+        This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues.
+        When PyTorch Dynamo traces the forward pass, it cannot handle
+        nn.Module.__setattr__ calls (which include parameter registration logic),
+        resulting in "Unsupported" errors. Using object.__setattr__ directly
+        sets the attribute without triggering nn.Module's custom __setattr__,
+        allowing Dynamo to trace the code successfully.
+        """
+        originals = {key: getattr(self, key) for key in attrs}
+        try:
+            for key, value in attrs.items():
+                object.__setattr__(self, key, value)
+            yield
+        finally:
+            for key, value in originals.items():
+                object.__setattr__(self, key, value)
+
+    def _compute_zero_expert_result(
+        self,
+        hidden_states: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+    ) -> torch.Tensor | None:
+        """Compute zero expert results using pre-computed routing."""
+        if (
+            self._actual_zero_expert_num is None
+            or self._actual_zero_expert_num <= 0
+            or self._actual_zero_expert_type is None
+        ):
+            return None
+
+        return zero_experts_compute_triton(
+            expert_indices=topk_ids.clone(),
+            expert_scales=topk_weights.clone(),
+            num_experts=self.logical_num_experts,
+            zero_expert_type=self._actual_zero_expert_type,
+            hidden_states=hidden_states,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,  # Full logits including zero experts
+    ) -> torch.Tensor:
+        """
+        Forward pass with zero expert support and routing memoization.
+
+        Args:
+            hidden_states: Input hidden states
+            router_logits: Full router logits (including zero experts)
+
+        Returns:
+            Combined output from real experts and zero experts
+        """
+        # Prepare temporary attribute overrides for routing computation
+        temp_attrs = {
+            "custom_routing_function": None,  # Disable for first routing
+        }
+        if self._router is not None:
+            temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias
+
+        # Compute routing with temporary attributes
+        # Pass full router_logits (including zero experts) so that zero experts
+        # can be properly identified in topk_ids
+        with self._temporarily_set_attrs(**temp_attrs):
+            topk_weights, topk_ids = self.select_experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,  # Full logits (includes zero experts)
+            )
+
+        # Compute zero expert result if needed
+        zero_expert_result = self._compute_zero_expert_result(
+            hidden_states=hidden_states,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+        )
+
+        # Memoize routing results for reuse in super().forward()
+        self._memoized_topk_weights = topk_weights
+        self._memoized_topk_ids = topk_ids
+
+        # Slice router_logits for real experts only
+        router_logits_sliced = router_logits[..., : self.logical_num_experts]
+
+        # Compute real expert results (will reuse memoized routing via
+        # custom_routing_function)
+        # zero_expert_num is already 0, so FusedMoE won't handle zero experts
+        fused_out = super().forward(
+            hidden_states=hidden_states,
+            router_logits=router_logits_sliced,
+        )
+
+        # Combine results
+        # Both zero_expert_result and fused_out are computed from the same
+        # hidden_states, so they should be on the same device.
+        if zero_expert_result is not None:
+            fused_out = fused_out + zero_expert_result
+
+        # Clear memoization after use
+        self._memoized_topk_weights = None
+        self._memoized_topk_ids = None
+
+        return fused_out
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 314848721a80a..602d02d2f15a4 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert layer.activation == "silu", "Only SiLU activation is supported."
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 1fd959cb3423d..efe5677045e4b 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index fc359a3067a9c..f4038801c266b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
                 e_score_correction_bias=layer.e_score_correction_bias,
             )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             f"{layer.activation} not supported for Marlin MoE."
         )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
             )
         assert self.moe_quant_config is not None
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 11097cf36f5ca..56b11b22f7ff5 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 30ca64238ae94..4b2438133dd6b 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1292,13 +1292,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     apply_router_weight_on_input=layer.apply_router_weight_on_input,
                 )
 
-        select_result = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
 
-        topk_weights, topk_ids, zero_expert_result = select_result
-
         if self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                 rocm_aiter_fused_experts,
@@ -1353,13 +1351,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 apply_router_weight_on_input=layer.apply_router_weight_on_input,
             )
 
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result
 
 
 class Fp8OnlineMoEMethod(Fp8MoEMethod):
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 9dd734f2fea6a..9600bb42295dc 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
                 "fused GGUF MoE method."
             )
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 347c7b2008d12..d2dafca99a230 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert layer.activation == "silu", "Only SiLU activation is supported."
 
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index aa3937d4c03ff..54e8673fcfbb8 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -796,7 +796,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
             )
 
         # Expert selection
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -1599,7 +1599,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
             x_routing, _ = x
         else:
             x_routing = x
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x_routing,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 4bedb951a33f5..513f6f7b21abc 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         assert layer.activation == "silu", "Only SiLU activation is supported."
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 832925825c453..dc0fbfa7df35a 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-            topk_weights, topk_ids, _ = layer.select_experts(
+            topk_weights, topk_ids = layer.select_experts(
                 hidden_states=x,
                 router_logits=router_logits,
             )
@@ -990,7 +990,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         ):
             from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
 
-            topk_weights, topk_ids, _ = layer.select_experts(
+            topk_weights, topk_ids = layer.select_experts(
                 hidden_states=x,
                 router_logits=router_logits,
             )
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 0b9b098afb1f6..81970480319ab 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
@@ -738,7 +738,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index b2ecb0b175f81..dce9c661ec332 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
         x: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        topk_weights, topk_ids, _ = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index c5441283f9711..774737387639b 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig):
         self.intermediate_size = (
             self.ffn_hidden_size
             if hasattr(self, "ffn_hidden_size")
-            else self.intermediate_size
+            else intermediate_size
         )
         if hasattr(self, "moe_intermediate_size"):
             self.moe_intermediate_size = self.moe_intermediate_size
@@ -280,10 +280,6 @@ class LongcatMoe(nn.Module):
     ):
         super().__init__()
         self.hidden_size = hidden_size
-        self.zero_expert_num = config.zero_expert_num
-        self.zero_expert_type = config.zero_expert_type
-        self.routed_scaling_factor = config.routed_scaling_factor
-        self.enable_eplb = enable_eplb
         # Gate always runs at half / full precision for now.
         self.rounter_params_dtype = params_dtype
         if config.router_dtype == "float32":
@@ -291,25 +287,27 @@ class LongcatMoe(nn.Module):
 
         self.router = LongcatRouter(
             config=config,
-            zero_expert_num=self.zero_expert_num,
+            zero_expert_num=config.zero_expert_num,
             rounter_params_dtype=self.rounter_params_dtype,
             prefix=f"{prefix}.gate",
         )
 
-        self.experts = FusedMoE(
+        assert config.zero_expert_num is not None
+        assert config.zero_expert_type is not None
+        self.experts = ZeroExpertFusedMoE(
+            zero_expert_num=config.zero_expert_num,
+            zero_expert_type=config.zero_expert_type,
+            router=self.router,
             num_experts=num_experts,
             top_k=top_k,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=True,
             params_dtype=params_dtype,
-            e_score_correction_bias=self.router.e_score_correction_bias,
             renormalize=False,
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
-            zero_expert_num=self.zero_expert_num,
-            zero_expert_type=self.zero_expert_type,
-            enable_eplb=self.enable_eplb,
+            enable_eplb=enable_eplb,
             routed_scaling_factor=config.routed_scaling_factor,
         )
 
@@ -317,11 +315,34 @@ class LongcatMoe(nn.Module):
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
-        router_logits = self.router(hidden_states.to(self.rounter_params_dtype))
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states, router_logits=router_logits
+        # Align to FusedMoE padded hidden size to avoid dim mismatch
+        padded_hidden = self.experts.hidden_size
+        if hidden_dim < padded_hidden:
+            hidden_states_padded = torch.nn.functional.pad(
+                hidden_states,
+                (0, padded_hidden - hidden_dim),
+                mode="constant",
+                value=0.0,
+            )
+        else:
+            hidden_states_padded = hidden_states
+
+        router_logits_full = self.router(
+            hidden_states_padded.to(self.rounter_params_dtype)
         )
 
+        # ZeroExpertFusedMoE handles routing memoization and zero expert computation
+        # internally. Pass full router_logits (including zero experts) so that
+        # zero experts can be properly identified in routing.
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states_padded,
+            router_logits=router_logits_full,  # Full logits (includes zero experts)
+        )
+
+        # Crop back to original hidden dimension if padded earlier
+        if padded_hidden != hidden_dim:
+            final_hidden_states = final_hidden_states[..., :hidden_dim]
+
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
@@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module):
         hidden_states = self.self_attn[0](
             positions=positions,
             hidden_states=hidden_states,
+            llama_4_scaling=None,
         )
 
         hidden_states, residual = self.post_attention_layernorm[0](
@@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module):
         hidden_states = self.self_attn[1](
             positions=positions,
             hidden_states=hidden_states,
+            llama_4_scaling=None,
         )
         hidden_states, residual = self.post_attention_layernorm[1](
             hidden_states, residual

From ee52d9901d368da35a80048c299ccb53e78a9a01 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Date: Sun, 21 Dec 2025 04:02:57 +0800
Subject: [PATCH 4/8] [Quantization] support logical_widths for fp8 marlin
 (#30962)

Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../layers/quantization/utils/marlin_utils_fp8.py      | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 93d238a0524d8..1fb5223b07d76 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -142,10 +142,20 @@ def prepare_fp8_layer_for_marlin(
     # marlin kernel only support channel-wise and group-wise quantization
     # we need to convert the scales
     if weight_block_size is None:
+        logical_widths = getattr(layer, "logical_widths", [])
         if scales.nelement() == 1:
             # tensor-wise quantization -> channel-wise quantization
             # (1, 1) =>(repeat)=> (1, size_n)
             scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
+        elif scales.nelement() == len(logical_widths):
+            # tensor-wise quantization with logical_widths ->
+            #    channel-wise quantization
+            assert sum(logical_widths) == part_size_n, (
+                f"Sum of logical_widths ({sum(logical_widths)}) must be equal "
+                f"to part_size_n ({part_size_n})"
+            )
+            lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
+            scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)
         elif scales.nelement() > 1 and scales.nelement() != part_size_n:
             assert part_size_n % scales.nelement() == 0
             s_size = scales.nelement()

From ae0770fa6bcc58892692f0d1b2a3f0a300dbe82a Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Sat, 20 Dec 2025 16:48:49 -0500
Subject: [PATCH 5/8] [CI] Fix H200 Distributed test (#31054)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 .buildkite/test-amd.yaml                    |   6 +-
 .buildkite/test-pipeline.yaml               |   6 +-
 .buildkite/test_areas/distributed.yaml      |   4 +-
 examples/offline_inference/data_parallel.py | 164 +++++---------------
 4 files changed, 51 insertions(+), 129 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 9a770869b1d17..bd00c47df8cb0 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1254,13 +1254,13 @@ steps:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90
@@ -1508,7 +1508,7 @@ steps:
     - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f644504a5b937..3c823fc872b05 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1109,13 +1109,13 @@ steps:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
-    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
+    - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 68min
   timeout_in_minutes: 90
@@ -1334,7 +1334,7 @@ steps:
     - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 52d57c99fcfb5..1a3739cc2417a 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -145,7 +145,7 @@ steps:
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
+    - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 - label: Distributed Tests (2 GPUs)(B200)
@@ -171,7 +171,7 @@ steps:
   - tests/distributed/
   - tests/examples/offline_inference/data_parallel.py
   commands:
-    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
+    - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
 
 - label: Distributed NixlConnector PD accuracy (4 GPUs)
   timeout_in_minutes: 30
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index be0b846995a92..bcf1ba307eff1 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -5,25 +5,25 @@ Usage:
 Single node:
     python examples/offline_inference/data_parallel.py \
             --model="ibm-research/PowerMoE-3b" \
-            --dp-size=2 \
-            --tp-size=2
+            -dp=2 \
+            -tp=2
 
 Multi-node:
     Node 0 (assume the node has ip of 10.99.48.128):
             python examples/offline_inference/data_parallel.py \
                     --model="ibm-research/PowerMoE-3b" \
-                    --dp-size=2 \
-                    --tp-size=2 \
-                    --node-size=2 \
+                    -dp=2 \
+                    -tp=2 \
+                    --nnodes=2 \
                     --node-rank=0 \
                     --master-addr=10.99.48.128 \
                     --master-port=13345
     Node 1:
             python examples/offline_inference/data_parallel.py \
                     --model="ibm-research/PowerMoE-3b" \
-                    --dp-size=2 \
-                    --tp-size=2 \
-                    --node-size=2 \
+                    -dp=2 \
+                    -tp=2 \
+                    --nnodes=2 \
                     --node-rank=1 \
                     --master-addr=10.99.48.128 \
                     --master-port=13345
@@ -32,103 +32,40 @@ Multi-node:
 import os
 from time import sleep
 
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.platforms import current_platform
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import get_open_port
 
 
-def parse_args():
-    import argparse
+def create_parser():
+    parser = FlexibleArgumentParser(description="Data Parallel Inference")
 
-    parser = argparse.ArgumentParser(description="Data Parallel Inference")
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="ibm-research/PowerMoE-3b",
-        help="Model name or path",
-    )
-    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
-    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
-    parser.add_argument(
-        "--node-size", type=int, default=1, help="Total number of nodes"
-    )
-    parser.add_argument(
-        "--node-rank", type=int, default=0, help="Rank of the current node"
-    )
-    parser.add_argument(
-        "--master-addr", type=str, default="", help="Master node IP address"
-    )
-    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
-    parser.add_argument(
-        "--enforce-eager", action="store_true", help="Enforce eager mode execution."
-    )
-    parser.add_argument(
-        "--trust-remote-code", action="store_true", help="Trust remote code."
-    )
-    parser.add_argument(
-        "--max-num-seqs",
-        type=int,
-        default=64,
-        help=("Maximum number of sequences to be processed in a single iteration."),
-    )
-    parser.add_argument(
-        "--max-model-len",
-        type=int,
-        help=("Maximum number of tokens to be processed in a single iteration."),
+    # Add all engine args
+    EngineArgs.add_cli_args(parser)
+    parser.set_defaults(
+        model="ibm-research/PowerMoE-3b",
+        enable_expert_parallel=True,
     )
+
+    # Add timeout (not in EngineArgs)
     parser.add_argument(
         "--timeout",
         type=int,
         default=300,
-        help=("Number of seconds before unresponsive process is killed."),
+        help="Number of seconds before unresponsive process is killed.",
     )
-    parser.add_argument(
-        "--gpu-memory-utilization",
-        type=float,
-        default=0.8,
-        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
-    )
-    parser.add_argument(
-        "--enable-dbo",
-        action="store_true",
-        help=("Enable microbatched execution"),
-    )
-    parser.add_argument(
-        "--compilation-config",
-        type=int,
-        help=("Compilation optimization (O) mode 0-3."),
-    )
-    parser.add_argument(
-        "--quantization",
-        type=str,
-    )
-    parser.add_argument(
-        "--disable-expert-parallel",
-        dest="enable_expert_parallel",
-        action="store_false",
-        help="Disable expert parallel (default: enabled).",
-    )
-    parser.set_defaults(enable_expert_parallel=True)
-    return parser.parse_args()
+
+    return parser
 
 
 def main(
-    model,
     dp_size,
     local_dp_rank,
     global_dp_rank,
     dp_master_ip,
     dp_master_port,
-    GPUs_per_dp_rank,
-    enforce_eager,
-    enable_expert_parallel,
-    trust_remote_code,
-    max_num_seqs,
-    max_model_len,
-    compilation_config,
-    gpu_memory_utilization,
-    enable_dbo,
-    quantization,
+    engine_args,
 ):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -173,19 +110,7 @@ def main(
     )
 
     # Create an LLM.
-    llm = LLM(
-        model=model,
-        tensor_parallel_size=GPUs_per_dp_rank,
-        enforce_eager=enforce_eager,
-        enable_expert_parallel=enable_expert_parallel,
-        trust_remote_code=trust_remote_code,
-        max_num_seqs=max_num_seqs,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=gpu_memory_utilization,
-        enable_dbo=enable_dbo,
-        quantization=quantization,
-        compilation_config=compilation_config,
-    )
+    llm = LLM(**engine_args)
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
     for i, output in enumerate(outputs):
@@ -204,22 +129,29 @@ def main(
 
 
 if __name__ == "__main__":
-    args = parse_args()
+    parser = create_parser()
+    args = vars(parser.parse_args())
 
-    dp_size = args.dp_size
-    tp_size = args.tp_size
-    node_size = args.node_size
-    node_rank = args.node_rank
+    # Extract DP-specific args
+    dp_size = args.pop("data_parallel_size")
+    nnodes = args.get("nnodes", 1)
+    node_rank = args.get("node_rank", 0)
+    master_addr = args.get("master_addr", "")
+    master_port = args.get("master_port", 0)
+    timeout = args.pop("timeout")
 
-    if node_size == 1:
+    # Remaining args are engine args
+    engine_args = args
+
+    if nnodes == 1:
         dp_master_ip = "127.0.0.1"
         dp_master_port = get_open_port()
     else:
-        dp_master_ip = args.master_addr
-        dp_master_port = args.master_port
+        dp_master_ip = master_addr
+        dp_master_port = master_port
 
-    assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
-    dp_per_node = dp_size // node_size
+    assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes"
+    dp_per_node = dp_size // nnodes
 
     from multiprocessing import Process
 
@@ -235,29 +167,19 @@ if __name__ == "__main__":
         proc = Process(
             target=main,
             args=(
-                args.model,
                 dp_size,
                 local_dp_rank,
                 global_dp_rank,
                 dp_master_ip,
                 dp_master_port,
-                tp_size,
-                args.enforce_eager,
-                args.enable_expert_parallel,
-                args.trust_remote_code,
-                args.max_num_seqs,
-                args.max_model_len,
-                args.compilation_config,
-                args.gpu_memory_utilization,
-                args.enable_dbo,
-                args.quantization,
+                engine_args,
             ),
         )
         proc.start()
         procs.append(proc)
     exit_code = 0
     for proc in procs:
-        proc.join(timeout=args.timeout)
+        proc.join(timeout=timeout)
         if proc.exitcode is None:
             print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
             proc.kill()

From 7c73ceb5812ace65e0d1b6ada3622b8b9f0400c0 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Date: Sun, 21 Dec 2025 05:58:11 +0800
Subject: [PATCH 6/8] [Quantization] add marlin w4a8/w8a8 check (#31061)

Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
---
 .../layers/quantization/utils/marlin_utils.py        | 12 ++++++++++++
 .../layers/quantization/utils/marlin_utils_fp4.py    | 12 ++++++++++++
 .../layers/quantization/utils/marlin_utils_fp8.py    |  4 ++++
 3 files changed, 28 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 66e979b505f0d..3de2b6509e460 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -594,9 +594,15 @@ def apply_awq_marlin_linear(
 
     a_scales = None
     if input_dtype == torch.int8:
+        assert quant_type == scalar_types.uint4, (
+            "W8A8-INT8 is not supported by marlin kernel."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
         a_scales = a_scales * input_global_scale
     elif input_dtype == torch.float8_e4m3fn:
+        assert quant_type == scalar_types.uint4, (
+            "INT8 weight + FP8 activation is not supported."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
 
     output = ops.gptq_marlin_gemm(
@@ -649,9 +655,15 @@ def apply_rtn_marlin_linear(
 
     a_scales = None
     if input_dtype == torch.int8:
+        assert quant_type == scalar_types.uint4b8, (
+            "W8A8-INT8 is not supported by marlin kernel."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
         a_scales = a_scales * input_global_scale
     elif input_dtype == torch.float8_e4m3fn:
+        assert quant_type == scalar_types.uint4b8, (
+            "INT8 weight + FP8 activation is not supported."
+        )
         reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
 
     output = ops.gptq_marlin_gemm(
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index 876c724bf972d..4d0a34c3be119 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin(
     )
 
     is_nvfp4 = hasattr(layer, "weight_scale_2")
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        if is_nvfp4:
+            raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
+        elif input_dtype != torch.float8_e4m3fn:
+            raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
+
     group_size = 16 if is_nvfp4 else 32
 
     part_size_n = layer.output_size_per_partition
@@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin(
     )
 
     is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        if is_nvfp4:
+            raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
+        elif input_dtype != torch.float8_e4m3fn:
+            raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
+
     group_size = 16 if is_nvfp4 else 32
 
     e = layer.num_experts
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 1fb5223b07d76..4d2f2fd71ad36 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -99,6 +99,8 @@ def prepare_fp8_layer_for_marlin(
         "be used leveraging the Marlin kernel. This may degrade "
         "performance for compute-heavy workloads."
     )
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        raise RuntimeError("Marlin W8A8 is not supported.")
 
     part_size_n = layer.output_size_per_partition
     part_size_k = layer.input_size_per_partition
@@ -206,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin(
         "be used leveraging the Marlin kernel. This may degrade "
         "performance for compute-heavy workloads."
     )
+    if input_dtype is not None and input_dtype.itemsize == 1:
+        raise RuntimeError("Marlin W8A8 is not supported.")
 
     e = layer.num_experts
     k = layer.hidden_size

From 3e92b2b7acaa61335ecd7bea5eeed50388739194 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= <wangzhipeng628@gmail.com>
Date: Sun, 21 Dec 2025 10:39:31 +0800
Subject: [PATCH 7/8] [BugFix]fix gpt-oss v1/completions response bug (#30608)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: princepride <wangzhipeng628@gmail.com>
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: bbrowning <bbrownin@redhat.com>
---
 .../openai/test_response_api_with_harmony.py  | 21 +++++++++++++++++++
 tests/entrypoints/openai/test_serving_chat.py | 15 +++++--------
 vllm/entrypoints/openai/serving_chat.py       |  9 ++++----
 vllm/tool_parsers/openai_tool_parser.py       |  9 ++++++++
 4 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 6f2a50020699c..8ef0d7f277d5f 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
+import importlib.util
 import json
 import time
 
@@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages(
     assert (
         "aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
+    response = await client.chat.completions.create(
+        model=model_name,
+        messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
+        temperature=0.0,
+        max_tokens=250,
+    )
+
+    choice = response.choices[0]
+    assert choice.finish_reason == "length", (
+        f"Expected finish_reason='length', got {choice.finish_reason}"
+    )
+    assert choice.message.content is not None, (
+        "Content should not be None when truncated"
+    )
+    assert len(choice.message.content) > 0, "Content should not be empty"
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 6fb074b8a19ba..d845913b8ee03 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -955,7 +955,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
             ],
         )
@@ -983,7 +982,6 @@ class TestServingChatWithHarmony:
             input_messages_2,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user"},
                 # The analysis message should be dropped on subsequent inputs because
                 # of the subsequent assistant message to the final channel.
@@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the second turn's input
-        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
         verify_harmony_messages(
             input_messages_2,
@@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the second turn's input
-        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
         verify_harmony_messages(
             input_messages_2,
@@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the second turn's input
-        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
         verify_harmony_messages(
             input_messages_2,
@@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the third turn's input
-        req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
         verify_harmony_messages(
             input_messages_3,
@@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony:
         )
 
         # Test the Harmony messages for the fourth turn's input
-        req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
+        req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
         input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
         verify_harmony_messages(
             input_messages_4,
@@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
                 # The reasoning that would have resulted in an analysis message is
                 # dropped because of a later assistant message to the final channel.
@@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
                 {
                     "role": "assistant",
@@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony:
             input_messages,
             [
                 {"role": "system"},
-                {"role": "developer"},
                 {"role": "user", "content": messages[0]["content"]},
                 {
                     "role": "assistant",
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index d437d1e5c3b06..88d87a3334955 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1828,10 +1828,11 @@ class OpenAIServingChat(OpenAIServing):
         messages.append(sys_msg)
 
         # Add developer message.
-        dev_msg = get_developer_message(
-            tools=request.tools if should_include_tools else None
-        )
-        messages.append(dev_msg)
+        if request.tools:
+            dev_msg = get_developer_message(
+                tools=request.tools if should_include_tools else None
+            )
+            messages.append(dev_msg)
 
         # Add user message.
         messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
diff --git a/vllm/tool_parsers/openai_tool_parser.py b/vllm/tool_parsers/openai_tool_parser.py
index db92ea8982d70..da1a9c773f78f 100644
--- a/vllm/tool_parsers/openai_tool_parser.py
+++ b/vllm/tool_parsers/openai_tool_parser.py
@@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser):
                 elif msg.channel == "commentary" and not msg.recipient:
                     commentary_content = msg_text
 
+        # Extract partial content from the parser state if the generation was truncated
+        if parser.current_content:
+            if parser.current_channel == "final":
+                final_content = parser.current_content
+            elif (
+                parser.current_channel == "commentary" and not parser.current_recipient
+            ):
+                commentary_content = parser.current_content
+
         return ExtractedToolCallInformation(
             tools_called=len(tool_calls) > 0,
             tool_calls=tool_calls,

From bb80f69bc98cbf062bf030cb11185f7ba526e28a Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sun, 21 Dec 2025 11:24:34 +0800
Subject: [PATCH 8/8] add aarnphm and chaunceyjiang to the new tool_parser
 directory (#31088)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 .github/CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d6447649cd89a..4d7a366f05e37 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -15,6 +15,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/tool_parsers @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 /vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson