[Feature] Enable TP + EP shared_experts overlap with router, 3.7% E2E performance improvement (#28164)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Wentao Ye 2025-11-05 20:21:08 -05:00 committed by GitHub
parent 90189c71a9
commit d71af5f502
2 changed files with 16 additions and 8 deletions
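For context, the "shared_experts overlap with router" being enabled here refers to running the shared expert (a dense MLP applied to every token) alongside the router/routed-expert path instead of strictly after it. The toy module below is a minimal, self-contained sketch of that data-flow independence, not vLLM's implementation; the class and parameter names (ToySharedMoE, hidden, inter, n_experts, top_k) are illustrative assumptions.

import torch
import torch.nn as nn


class ToySharedMoE(nn.Module):
    """Toy MoE block with one shared expert plus routed experts (illustration only)."""

    def __init__(self, hidden: int, inter: int, n_experts: int, top_k: int = 2):
        super().__init__()
        self.router = nn.Linear(hidden, n_experts, bias=False)
        self.shared_expert = nn.Sequential(
            nn.Linear(hidden, inter), nn.SiLU(), nn.Linear(inter, hidden)
        )
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden, inter), nn.SiLU(), nn.Linear(inter, hidden))
            for _ in range(n_experts)
        )
        self.top_k = top_k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The shared expert depends only on x, not on the router output. That
        # independence is what makes the overlap possible in the real layer
        # (its GEMMs can proceed while routed tokens are dispatched under EP);
        # this toy version simply computes it first to show the independence.
        shared_out = self.shared_expert(x)

        weights, idx = torch.topk(
            torch.softmax(self.router(x), dim=-1), self.top_k, dim=-1
        )
        routed_out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, k] == e
                if mask.any():
                    routed_out[mask] += weights[mask, k : k + 1] * expert(x[mask])
        return shared_out + routed_out


# Example: 8 tokens, hidden size 16, 4 experts, top-2 routing.
moe = ToySharedMoE(hidden=16, inter=32, n_experts=4, top_k=2)
print(moe(torch.randn(8, 16)).shape)  # torch.Size([8, 16])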

@@ -1178,7 +1178,7 @@ class FusedMoE(CustomOp):
         hidden_size: Input hidden state size of the transformer
         intermediate_size: Intermediate size of the experts
         params_dtype: Data type for the parameters.
-        reduce_results: Whether to all all_reduce on the output of the layer
+        reduce_results: Whether to all_reduce on the output of the layer
         renormalize: Whether to renormalize the logits in the fused_moe kernel
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.

@@ -3,7 +3,10 @@
 import torch
-from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
@@ -25,16 +28,13 @@ class SharedFusedMoE(FusedMoE):
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
-        # Disable shared expert overlap if EP is disabled or we are not using
+        # Disable shared expert overlap if we are not using
         # flashinfer + DP since there is nothing to be gained in this case.
         # Disabling the overlap optimization also prevents the shared experts
         # from being hidden from torch.compile.
         self.use_overlapped = (
             use_overlapped
-            and not (
-                self.use_ep
-                or (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
-            )
+            and not (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
             and self._shared_experts is not None
         )
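Rephrased as a standalone predicate, the new gate keeps the overlap enabled under EP and only opts out for the flashinfer-cutlass + DP > 1 path. This is a hedged paraphrase of the condition above; the free function and its parameter names are assumptions, not vLLM API.

def overlap_enabled(
    use_overlapped: bool,
    has_shared_experts: bool,
    use_flashinfer_cutlass_kernels: bool,
    dp_size: int,
) -> bool:
    """Illustrative paraphrase of SharedFusedMoE's new use_overlapped gate."""
    # EP no longer forces the overlap off; only flashinfer-cutlass with DP > 1
    # opts out, since there is nothing to be gained in that case.
    return (
        use_overlapped
        and has_shared_experts
        and not (use_flashinfer_cutlass_kernels and dp_size > 1)
    )


# Previously, use_ep=True would have disabled the overlap; now it stays on:
assert overlap_enabled(True, True, use_flashinfer_cutlass_kernels=False, dp_size=1)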
@@ -65,7 +65,7 @@ class SharedFusedMoE(FusedMoE):
                 # should have been created with reduce_results=False.
                 if (
                     self.reduce_results
-                    and self.tp_size > 1
+                    and get_tensor_model_parallel_world_size() > 1
                     and self.must_reduce_shared_expert_outputs()
                 ):
                     shared_out = tensor_model_parallel_all_reduce(shared_out)
@@ -81,4 +81,12 @@
                 hidden_states=hidden_states,
                 router_logits=router_logits,
             )
+            # ensure early TP reduction of shared expert outputs when required
+            if (
+                shared_out is not None
+                and self.reduce_results
+                and get_tensor_model_parallel_world_size() > 1
+                and self.must_reduce_shared_expert_outputs()
+            ):
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
         return shared_out, fused_out
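The hunk above applies the same TP reduction rule on the overlapped path that the non-overlapped path already used. Below is a minimal sketch of that rule using plain torch.distributed, assuming a tensor-parallel process group is already initialized; maybe_reduce_shared_out, tp_group, and must_reduce are hypothetical names standing in for the layer's attributes and its must_reduce_shared_expert_outputs() check.

import torch
import torch.distributed as dist


def maybe_reduce_shared_out(
    shared_out: torch.Tensor | None,
    reduce_results: bool,
    tp_group: dist.ProcessGroup | None,
    must_reduce: bool,
) -> torch.Tensor | None:
    """All-reduce partial shared-expert outputs across TP ranks when required."""
    if shared_out is None:
        return None
    tp_world_size = dist.get_world_size(tp_group) if dist.is_initialized() else 1
    if reduce_results and tp_world_size > 1 and must_reduce:
        # Under TP the shared expert's down-projection is typically sharded,
        # so each rank holds a partial sum; sum them across the TP group.
        dist.all_reduce(shared_out, op=dist.ReduceOp.SUM, group=tp_group)
    return shared_out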