mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 15:37:54 +08:00
[Bugfix] Fix marlin moe fallback logic for llama4 (#18042)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
parent
2ff297dce9
commit
ea6ae8cb45
@ -4,4 +4,5 @@ compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
|
|||||||
compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
|
compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
|
||||||
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
|
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
|
||||||
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
|
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
|
||||||
awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
|
awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
|
||||||
|
compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main
|
||||||
@ -480,6 +480,7 @@ class FusedMoE(torch.nn.Module):
|
|||||||
self.custom_routing_function = custom_routing_function
|
self.custom_routing_function = custom_routing_function
|
||||||
self.scoring_func = scoring_func
|
self.scoring_func = scoring_func
|
||||||
self.e_score_correction_bias = e_score_correction_bias
|
self.e_score_correction_bias = e_score_correction_bias
|
||||||
|
self.apply_router_weight_on_input = apply_router_weight_on_input
|
||||||
self.activation = activation
|
self.activation = activation
|
||||||
|
|
||||||
if self.scoring_func != "softmax" and not self.use_grouped_topk:
|
if self.scoring_func != "softmax" and not self.use_grouped_topk:
|
||||||
@ -498,7 +499,6 @@ class FusedMoE(torch.nn.Module):
|
|||||||
self.quant_method = quant_config.get_quant_method(self, prefix)
|
self.quant_method = quant_config.get_quant_method(self, prefix)
|
||||||
assert self.quant_method is not None
|
assert self.quant_method is not None
|
||||||
|
|
||||||
self.apply_router_weight_on_input = apply_router_weight_on_input
|
|
||||||
moe_quant_params = {
|
moe_quant_params = {
|
||||||
"num_experts": self.local_num_experts,
|
"num_experts": self.local_num_experts,
|
||||||
"hidden_size": hidden_size,
|
"hidden_size": hidden_size,
|
||||||
|
|||||||
@ -171,13 +171,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
|
|||||||
-> bool:
|
-> bool:
|
||||||
hidden_size = layer.hidden_size
|
hidden_size = layer.hidden_size
|
||||||
intermediate_size_per_partition = layer.intermediate_size_per_partition
|
intermediate_size_per_partition = layer.intermediate_size_per_partition
|
||||||
|
# apply_router_weight_on_input is not supported for moe marlin
|
||||||
|
supports_router_weight = not layer.apply_router_weight_on_input
|
||||||
|
# moe marlin requires the activation to be silu
|
||||||
|
supports_activation = layer.activation == "silu"
|
||||||
|
|
||||||
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
|
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
|
||||||
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
|
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
|
||||||
# moe marlin requires n % 128 == 0 and k % 64 == 0
|
# moe marlin requires n % 128 == 0 and k % 64 == 0
|
||||||
return hidden_size % 128 == 0 and \
|
supports_shape = hidden_size % 128 == 0 and \
|
||||||
intermediate_size_per_partition % max(64, group_size) == 0 and \
|
intermediate_size_per_partition % max(64, group_size) == 0
|
||||||
group_size in [-1, 32, 64, 128]
|
supports_group_size = group_size in [-1, 32, 64, 128]
|
||||||
|
return supports_shape and supports_group_size and \
|
||||||
|
supports_router_weight and supports_activation
|
||||||
|
|
||||||
|
|
||||||
def marlin_make_workspace(output_size_per_partition: int,
|
def marlin_make_workspace(output_size_per_partition: int,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user