From 4f510bc2a175a6eebfdceafa7013da8a3865eb38 Mon Sep 17 00:00:00 2001 From: yiz-liu <136800916+yiz-liu@users.noreply.github.com> Date: Wed, 20 Aug 2025 00:18:41 +0800 Subject: [PATCH] [Model] Removes redundant all-reduce operation in Qwen3MoeSparseMoeBlock (#23169) Signed-off-by: Yizhou Liu --- vllm/model_executor/models/qwen3_moe.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 61b16b6a1d2d8..05bbb0d2e8995 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -139,7 +139,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module): top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, - reduce_results=False, + reduce_results=True, renormalize=config.norm_topk_prob, quant_config=quant_config, prefix=f"{prefix}.experts", @@ -163,10 +163,6 @@ class Qwen3MoeSparseMoeBlock(nn.Module): final_hidden_states = self.experts(hidden_states=hidden_states, router_logits=router_logits) - if self.tp_size > 1: - final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 - final_hidden_states) - return final_hidden_states.view(orig_shape)