From 1aaecda07809d29e1559791fdb61880d4125c332 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 8 Nov 2025 08:33:11 +0800
Subject: [PATCH] [XPU] Enable Expert parallel for MoE models (#28263)

Signed-off-by: Yan Ma
Signed-off-by: Kunshang Ji
---
 vllm/model_executor/layers/fused_moe/layer.py         | 2 ++
 vllm/model_executor/layers/quantization/ipex_quant.py | 2 ++
 vllm/model_executor/layers/quantization/mxfp4.py      | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 1236116386c9..e69ead074c50 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -642,10 +642,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         if current_platform.is_xpu():
             import intel_extension_for_pytorch as ipex
 
+            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
             layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
                 layer.w13_weight,
                 layer.w2_weight,
                 use_prepack=True,
+                experts_start_id=ep_rank_start,
             )
         elif current_platform.is_cpu():
             from vllm.model_executor.layers.fused_moe import cpu_fused_moe
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 5b3aabfde0c1..e0234191c62b 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -399,6 +399,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
         layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
         import intel_extension_for_pytorch as ipex
 
+        ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
         layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
             layer.w13_weight,
             layer.w2_weight,
@@ -407,6 +408,7 @@
             a1_scale_inv=layer.w13_input_scale,
             a2_scale_inv=layer.w2_input_scale,
             use_prepack=True,
+            experts_start_id=ep_rank_start,
         )
 
     def get_fused_moe_quant_config(
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 28dba091f430..e339f15510d7 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1113,6 +1113,7 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
         layer.w13_weight.data = layer.w13_weight.data.view(torch.int32)
         layer.w2_weight.data = layer.w2_weight.data.view(torch.int32)
 
+        ep_rank_start = self.moe_config.ep_rank * self.moe_config.num_local_experts
         layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
             layer.w13_weight,
             layer.w2_weight,
@@ -1121,6 +1122,7 @@
             w13_bias=layer.w13_bias,
             w2_bias=layer.w2_bias,
             is_mxfp4=True,
+            experts_start_id=ep_rank_start,
         )
 
     def apply(
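
Note (not part of the patch): the offset passed as experts_start_id follows the usual expert-parallel layout, where EP rank r owns the contiguous slice of num_local_experts experts beginning at r * num_local_experts. Below is a minimal Python sketch of that mapping for illustration only; local_expert_range, ep_size, and global_num_experts are made-up names, not vLLM or IPEX APIs, and an even, contiguous split of experts across ranks is assumed.

def local_expert_range(ep_rank: int, ep_size: int, global_num_experts: int) -> tuple[int, int]:
    # Assumption for illustration: experts are split evenly and contiguously across EP ranks.
    assert global_num_experts % ep_size == 0
    num_local_experts = global_num_experts // ep_size
    start = ep_rank * num_local_experts  # corresponds to ep_rank_start in the patch
    return start, start + num_local_experts  # this rank owns experts in [start, end)

# Example: 64 experts across 4 EP ranks -> rank 2 owns experts 32..47.
print(local_expert_range(ep_rank=2, ep_size=4, global_num_experts=64))  # (32, 48)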