From ea6ae8cb45f7c1f8bbe76e1166893adae43881ae Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 13 May 2025 03:53:28 -0400 Subject: [PATCH] [Bugfix] Fix marlin moe fallback logic for llama4 (#18042) Signed-off-by: mgoin --- tests/weight_loading/models-large.txt | 3 ++- vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../layers/quantization/utils/marlin_utils.py | 12 +++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt index 9c1c11da572ea..ee98aed2684d1 100644 --- a/tests/weight_loading/models-large.txt +++ b/tests/weight_loading/models-large.txt @@ -4,4 +4,5 @@ compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True -awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main \ No newline at end of file +awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main +compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5337ff0037da4..6a3d00acd258f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -480,6 +480,7 @@ class FusedMoE(torch.nn.Module): self.custom_routing_function = custom_routing_function self.scoring_func = scoring_func self.e_score_correction_bias = e_score_correction_bias + self.apply_router_weight_on_input = apply_router_weight_on_input self.activation = activation if self.scoring_func != "softmax" and not self.use_grouped_topk: @@ -498,7 +499,6 @@ class FusedMoE(torch.nn.Module): self.quant_method = quant_config.get_quant_method(self, prefix) assert self.quant_method is not None - self.apply_router_weight_on_input = apply_router_weight_on_input moe_quant_params = { "num_experts": self.local_num_experts, "hidden_size": hidden_size, diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 89268ef7a38ba..11efd5802e24e 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -171,13 +171,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \ -> bool: hidden_size = layer.hidden_size intermediate_size_per_partition = layer.intermediate_size_per_partition + # apply_router_weight_on_input is not supported for moe marlin + supports_router_weight = not layer.apply_router_weight_on_input + # moe marlin requires the activation to be silu + supports_activation = layer.activation == "silu" # gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size) # down: (n, k) = (hidden_size, intermediate_size_per_partition) # moe marlin requires n % 128 == 0 and k % 64 == 0 - return hidden_size % 128 == 0 and \ - intermediate_size_per_partition % max(64, group_size) == 0 and \ - group_size in [-1, 32, 64, 128] + supports_shape = hidden_size % 128 == 0 and \ + intermediate_size_per_partition % max(64, group_size) == 0 + supports_group_size = group_size in [-1, 32, 64, 128] + return supports_shape and supports_group_size and \ + supports_router_weight and supports_activation def marlin_make_workspace(output_size_per_partition: int,