[Feature] Enable TP + EP shared_experts overlap with router, 3.7% E2E performance improvement (#28164)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Wentao Ye 2025-11-05 20:21:08 -05:00 committed by GitHub
parent 90189c71a9
commit d71af5f502
2 changed files with 16 additions and 8 deletions
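For context, the "shared_experts overlap with router" being enabled here refers to running the shared expert (a dense MLP applied to every token) alongside the router/routed-expert path instead of strictly after it. The toy module below is a minimal, self-contained sketch of that data-flow independence, not vLLM's implementation; the class and parameter names (ToySharedMoE, hidden, inter, n_experts, top_k) are illustrative assumptions.

import torch
import torch.nn as nn


class ToySharedMoE(nn.Module):
    """Toy MoE block with one shared expert plus routed experts (illustration only)."""

    def __init__(self, hidden: int, inter: int, n_experts: int, top_k: int = 2):
        super().__init__()
        self.router = nn.Linear(hidden, n_experts, bias=False)
        self.shared_expert = nn.Sequential(
            nn.Linear(hidden, inter), nn.SiLU(), nn.Linear(inter, hidden)
        )
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden, inter), nn.SiLU(), nn.Linear(inter, hidden))
            for _ in range(n_experts)
        )
        self.top_k = top_k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The shared expert depends only on x, not on the router output. That
        # independence is what makes the overlap possible in the real layer
        # (its GEMMs can proceed while routed tokens are dispatched under EP);
        # this toy version simply computes it first to show the independence.
        shared_out = self.shared_expert(x)

        weights, idx = torch.topk(
            torch.softmax(self.router(x), dim=-1), self.top_k, dim=-1
        )
        routed_out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, k] == e
                if mask.any():
                    routed_out[mask] += weights[mask, k : k + 1] * expert(x[mask])
        return shared_out + routed_out


# Example: 8 tokens, hidden size 16, 4 experts, top-2 routing.
moe = ToySharedMoE(hidden=16, inter=32, n_experts=4, top_k=2)
print(moe(torch.randn(8, 16)).shape)  # torch.Size([8, 16])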

@@ -1178,7 +1178,7 @@ class FusedMoE(CustomOp):
         hidden_size: Input hidden state size of the transformer
         intermediate_size: Intermediate size of the experts
         params_dtype: Data type for the parameters.
-        reduce_results: Whether to all all_reduce on the output of the layer
+        reduce_results: Whether to all_reduce on the output of the layer
         renormalize: Whether to renormalize the logits in the fused_moe kernel
         quant_config: Quantization configure.
         enable_eplb: Whether to enable expert parallelism load balancer.

@@ -3,7 +3,10 @@
 import torch
-from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
@@ -25,16 +28,13 @@ class SharedFusedMoE(FusedMoE):
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
-        # Disable shared expert overlap if EP is disabled or we are not using
+        # Disable shared expert overlap if we are not using
         # flashinfer + DP since there is nothing to be gained in this case.
         # Disabling the overlap optimization also prevents the shared experts
         # from being hidden from torch.compile.
         self.use_overlapped = (
             use_overlapped
-            and not (
-                self.use_ep
-                or (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
-            )
+            and not (self.use_flashinfer_cutlass_kernels and self.dp_size > 1)
             and self._shared_experts is not None
         )
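Rephrased as a standalone predicate, the new gate keeps the overlap enabled under EP and only opts out for the flashinfer-cutlass + DP > 1 path. This is a hedged paraphrase of the condition above; the free function and its parameter names are assumptions, not vLLM API.

def overlap_enabled(
    use_overlapped: bool,
    has_shared_experts: bool,
    use_flashinfer_cutlass_kernels: bool,
    dp_size: int,
) -> bool:
    """Illustrative paraphrase of SharedFusedMoE's new use_overlapped gate."""
    # EP no longer forces the overlap off; only flashinfer-cutlass with DP > 1
    # opts out, since there is nothing to be gained in that case.
    return (
        use_overlapped
        and has_shared_experts
        and not (use_flashinfer_cutlass_kernels and dp_size > 1)
    )


# Previously, use_ep=True would have disabled the overlap; now it stays on:
assert overlap_enabled(True, True, use_flashinfer_cutlass_kernels=False, dp_size=1)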
@@ -65,7 +65,7 @@ class SharedFusedMoE(FusedMoE):
                 # should have been created with reduce_results=False.
                 if (
                     self.reduce_results
-                    and self.tp_size > 1
+                    and get_tensor_model_parallel_world_size() > 1
                     and self.must_reduce_shared_expert_outputs()
                 ):
                     shared_out = tensor_model_parallel_all_reduce(shared_out)
@@ -81,4 +81,12 @@
                 hidden_states=hidden_states,
                 router_logits=router_logits,
             )
+            # ensure early TP reduction of shared expert outputs when required
+            if (
+                shared_out is not None
+                and self.reduce_results
+                and get_tensor_model_parallel_world_size() > 1
+                and self.must_reduce_shared_expert_outputs()
+            ):
+                shared_out = tensor_model_parallel_all_reduce(shared_out)
         return shared_out, fused_out
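The hunk above applies the same TP reduction rule on the overlapped path that the non-overlapped path already used. Below is a minimal sketch of that rule using plain torch.distributed, assuming a tensor-parallel process group is already initialized; maybe_reduce_shared_out, tp_group, and must_reduce are hypothetical names standing in for the layer's attributes and its must_reduce_shared_expert_outputs() check.

import torch
import torch.distributed as dist


def maybe_reduce_shared_out(
    shared_out: torch.Tensor | None,
    reduce_results: bool,
    tp_group: dist.ProcessGroup | None,
    must_reduce: bool,
) -> torch.Tensor | None:
    """All-reduce partial shared-expert outputs across TP ranks when required."""
    if shared_out is None:
        return None
    tp_world_size = dist.get_world_size(tp_group) if dist.is_initialized() else 1
    if reduce_results and tp_world_size > 1 and must_reduce:
        # Under TP the shared expert's down-projection is typically sharded,
        # so each rank holds a partial sum; sum them across the TP group.
        dist.all_reduce(shared_out, op=dist.ReduceOp.SUM, group=tp_group)
    return shared_out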