mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-01-25 09:04:28 +08:00
[Bugfix] Fixes for new marlin moe usage (#18017)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
d8487ef557
commit
1df491c522
@@ -57,9 +57,10 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
"input_activations")
|
||||
|
||||
if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
|
||||
# group_size=None means channelwise
|
||||
group_size = weight_quant.group_size or -1
|
||||
# Prefer to use the MarlinMoE kernel when it is supported.
|
||||
if not check_moe_marlin_supports_layer(layer,
|
||||
weight_quant.group_size):
|
||||
if not check_moe_marlin_supports_layer(layer, group_size):
|
||||
if (weight_quant.strategy in QuantizationStrategy.GROUP and
|
||||
weight_quant.actorder in (ActivationOrdering.GROUP,
|
||||
ActivationOrdering.DYNAMIC)):
|
||||
|
||||
@@ -610,9 +610,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
activation: str = "silu",
|
||||
) -> torch.Tensor:
|
||||
assert activation == "silu", "Only SiLU activation is supported."
|
||||
if apply_router_weight_on_input is not None:
|
||||
if apply_router_weight_on_input:
|
||||
raise NotImplementedError(
|
||||
"Apply router weight on input is not supported for"
|
||||
"Apply router weight on input is not supported for "
|
||||
"fused Marlin MoE method.")
|
||||
|
||||
topk_weights, topk_ids = FusedMoE.select_experts(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user