From 72c5dd0310d34958ac405032f95e77245679f61a Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Thu, 2 Oct 2025 17:29:49 -0400
Subject: [PATCH] Fix MTP with deepep_low_latency (#25904)

Signed-off-by: Matthew Bonanni
Signed-off-by: yewentao256
---
 vllm/model_executor/layers/fused_moe/layer.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 8de1d14d46b33..9a7ca7b6d1240 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1899,6 +1899,15 @@ class FusedMoE(CustomOp):
             staged_hidden_states.copy_(hidden_states, non_blocking=True)
             staged_router_logits.copy_(router_logits, non_blocking=True)
 
+            # If there are shared experts but we are not using a modular kernel,
+            # the shared experts must be called here
+            if (not isinstance(self.quant_method.fused_experts,
+                               FusedMoEModularKernel)
+                    and self.shared_experts is not None):
+                shared_output = self.shared_experts(staged_hidden_states)
+            else:
+                shared_output = None
+
             # Matrix multiply.
             final_hidden_states = self.quant_method.apply(
                 layer=self,
@@ -1922,8 +1931,13 @@ class FusedMoE(CustomOp):
                 logical_replica_count=self.logical_replica_count,
             )
 
-            assert self.shared_experts is None or isinstance(
-                final_hidden_states, tuple)
+            if shared_output is not None:
+                assert not isinstance(final_hidden_states, tuple)
+                assert self.shared_experts is not None
+                final_hidden_states = (
+                    shared_output,
+                    final_hidden_states,
+                )
 
             if self.zero_expert_num is not None and self.zero_expert_num > 0:
                 assert isinstance(final_hidden_states, tuple)
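
Note (illustration, not part of the patch): the sketch below models the control flow the two hunks introduce. Per the patch's own comment, when shared experts exist but the fused_experts implementation is not a FusedMoEModularKernel, the shared experts must be called explicitly, and their output re-packed into the (shared, routed) tuple that downstream code asserts on. The class and function names in the sketch are hypothetical stand-ins, not vLLM APIs; only the branching mirrors the patched code.

# --- Illustrative sketch (assumptions: simplified stand-in kernels) ---
import torch


class _ModularKernel:
    # Stand-in for FusedMoEModularKernel: it handles the shared experts
    # itself and returns a (shared, routed) tuple.
    def __call__(self, x: torch.Tensor):
        return x * 0.5, x * 2.0


class _PlainKernel:
    # Stand-in for a non-modular fused_experts path: routed output only.
    def __call__(self, x: torch.Tensor):
        return x * 2.0


def forward_chunk(fused_experts, shared_experts, staged_hidden_states):
    # Non-modular kernels do not run the shared experts, so when shared
    # experts exist they are invoked explicitly before the routed-expert
    # matmul (mirrors the first hunk).
    if (not isinstance(fused_experts, _ModularKernel)
            and shared_experts is not None):
        shared_output = shared_experts(staged_hidden_states)
    else:
        shared_output = None

    final_hidden_states = fused_experts(staged_hidden_states)

    # Re-pack into the (shared, routed) tuple downstream code expects
    # (mirrors the second hunk).
    if shared_output is not None:
        assert not isinstance(final_hidden_states, tuple)
        final_hidden_states = (shared_output, final_hidden_states)
    return final_hidden_states


if __name__ == "__main__":
    x = torch.ones(4, 8)
    shared = lambda t: t + 1.0
    # Both kernel flavors end up producing a (shared, routed) tuple.
    assert isinstance(forward_chunk(_PlainKernel(), shared, x), tuple)
    assert isinstance(forward_chunk(_ModularKernel(), shared, x), tuple)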