mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 04:54:56 +08:00
[XPU] Enable Expert parallel for MoE models (#28263)
Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
parent
811df41ee9
commit
1aaecda078
@ -642,10 +642,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
if current_platform.is_xpu():
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
|
||||
layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
|
||||
layer.w13_weight,
|
||||
layer.w2_weight,
|
||||
use_prepack=True,
|
||||
experts_start_id=ep_rank_start,
|
||||
)
|
||||
elif current_platform.is_cpu():
|
||||
from vllm.model_executor.layers.fused_moe import cpu_fused_moe
|
||||
|
||||
@ -399,6 +399,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
|
||||
layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
|
||||
layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
|
||||
layer.w13_weight,
|
||||
layer.w2_weight,
|
||||
@ -407,6 +408,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
|
||||
a1_scale_inv=layer.w13_input_scale,
|
||||
a2_scale_inv=layer.w2_input_scale,
|
||||
use_prepack=True,
|
||||
experts_start_id=ep_rank_start,
|
||||
)
|
||||
|
||||
def get_fused_moe_quant_config(
|
||||
|
||||
@ -1113,6 +1113,7 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
|
||||
|
||||
layer.w13_weight.data = layer.w13_weight.data.view(torch.int32)
|
||||
layer.w2_weight.data = layer.w2_weight.data.view(torch.int32)
|
||||
ep_rank_start = self.moe_config.ep_rank * self.moe_config.num_local_experts
|
||||
layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
|
||||
layer.w13_weight,
|
||||
layer.w2_weight,
|
||||
@ -1121,6 +1122,7 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
|
||||
w13_bias=layer.w13_bias,
|
||||
w2_bias=layer.w2_bias,
|
||||
is_mxfp4=True,
|
||||
experts_start_id=ep_rank_start,
|
||||
)
|
||||
|
||||
def apply(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user