[TPU][Bugfix] fix moe layer (#21340)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
parent 90eeea8f85
commit e74bfc70e4
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
 MODELS = [
     "Qwen/Qwen2.5-1.5B-Instruct",
+    "Qwen/Qwen1.5-MoE-A2.7B",
     # TODO: Enable this models with v6e
     # "Qwen/Qwen2-7B-Instruct",
     # "meta-llama/Llama-3.1-8B",
@@ -481,8 +481,16 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
-        **kwargs,
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ):
+        if enable_eplb is not False or expert_load_view is not None or \
+            logical_to_physical_map is not None or \
+            logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for CPU.")
         return layer.cpu_fused_moe(
             layer,
             x,
@@ -518,6 +526,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         e_score_correction_bias: Optional[torch.Tensor] = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert not use_grouped_topk
         assert num_expert_group is None
@@ -531,6 +543,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             raise NotImplementedError(
                 "Expert score correction bias is not supported for TPU.")
         assert activation == "silu", f"{activation} is not supported for TPU."
+        if enable_eplb is not False or expert_load_view is not None or \
+            logical_to_physical_map is not None or \
+            logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for TPU.")
         return fused_moe_pallas(hidden_states=x,
                                 w1=layer.w13_weight,
                                 w2=layer.w2_weight,
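Taken together, the two layer.py hunks replace the catch-all **kwargs in the unquantized CPU and TPU MoE forward paths with explicit expert-parallel load-balancing (EPLB) parameters, and raise NotImplementedError as soon as any of them is actually set. Below is a minimal standalone sketch of that guard idiom; the helper name _reject_eplb_args and the backend string are hypothetical illustrations of the pattern, not code from the vllm source.

from typing import Optional

import torch


def _reject_eplb_args(
        backend: str,
        enable_eplb: bool = False,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None) -> None:
    # Hypothetical helper: mirrors the guard added in this commit by
    # rejecting any EPLB-related argument on a backend that does not
    # implement expert load balancing.
    if enable_eplb or expert_load_view is not None \
            or logical_to_physical_map is not None \
            or logical_replica_count is not None:
        raise NotImplementedError(
            f"Expert load balancing is not supported for {backend}.")


# Callers that leave the EPLB keywords at their defaults still work ...
_reject_eplb_args("TPU")
# ... while actually enabling EPLB now fails loudly instead of being
# silently dropped by a **kwargs catch-all:
# _reject_eplb_args("TPU", enable_eplb=True)  # raises NotImplementedError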