diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
index 60aa1c088b4d8..a143347b19f2c 100644
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -29,14 +29,14 @@ class SharedFusedMoE(FusedMoE):
         self._shared_experts = shared_experts
 
         # Disable shared expert overlap if:
-        # - we are using eplb, because of correctness issues
-        # - we are using flashinfer with DP, since there nothing to gain
+        # - we are using eplb with non-default backend, because of correctness issues
+        # - we are using flashinfer with DP, since there is nothing to gain
         # - we are using marlin kernels
+        backend = self.moe_parallel_config.all2all_backend
         self.use_overlapped = (
             use_overlapped
             and not (
-                # TODO(wentao): find the root cause and remove this condition
-                self.enable_eplb
+                (self.enable_eplb and backend != "allgather_reducescatter")
                 or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
             )
             and self._shared_experts is not None
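
For reference, here is the new gating logic in isolation: a minimal, self-contained sketch of how `use_overlapped` is now computed. The free function and its parameters are hypothetical stand-ins for attributes that `SharedFusedMoE` actually reads from `FusedMoE` (`enable_eplb`, `moe_parallel_config.all2all_backend`, `moe_config.use_flashinfer_cutlass_kernels`, `dp_size`); only the boolean structure mirrors the diff.

```python
# Hypothetical stand-alone version of the use_overlapped decision from the
# diff above. All parameters are stand-ins for FusedMoE attributes.
def compute_use_overlapped(
    use_overlapped: bool,
    enable_eplb: bool,
    all2all_backend: str,
    use_flashinfer_cutlass_kernels: bool,
    dp_size: int,
    has_shared_experts: bool,
) -> bool:
    # EPLB only disables overlap on non-default all2all backends; the
    # default "allgather_reducescatter" backend keeps overlap enabled.
    eplb_conflict = enable_eplb and all2all_backend != "allgather_reducescatter"
    # With flashinfer cutlass kernels + DP there is nothing to gain.
    flashinfer_dp = use_flashinfer_cutlass_kernels and dp_size > 1
    return (
        use_overlapped
        and not (eplb_conflict or flashinfer_dp)
        and has_shared_experts
    )


# EPLB with the default backend no longer disables overlap:
assert compute_use_overlapped(True, True, "allgather_reducescatter", False, 1, True)
# EPLB with any other backend still does (backend name here is illustrative):
assert not compute_use_overlapped(True, True, "deepep_low_latency", False, 1, True)
```

The net effect of the change is to narrow the old blanket `self.enable_eplb` condition (previously marked with a TODO) so that overlap is only disabled when EPLB is combined with a non-default all2all backend.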