From 1df491c522c92c3b15dea4a4cd92c437bbda9f3f Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 12 May 2025 23:50:04 -0400
Subject: [PATCH] [Bugfix] Fixes for new marlin moe usage (#18017)

Signed-off-by: mgoin
---
 .../compressed_tensors/compressed_tensors_moe.py       | 5 +++--
 vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 4a3fc2a1a6b9c..d905cc9eb0eff 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -57,9 +57,10 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             "input_activations")
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
+            # group_size=None means channelwise
+            group_size = weight_quant.group_size or -1
             # Prefer to use the MarlinMoE kernel when it is supported.
-            if not check_moe_marlin_supports_layer(layer,
-                                                   weight_quant.group_size):
+            if not check_moe_marlin_supports_layer(layer, group_size):
                 if (weight_quant.strategy in QuantizationStrategy.GROUP and
                         weight_quant.actorder in (ActivationOrdering.GROUP,
                                                   ActivationOrdering.DYNAMIC)):
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 56aafca87e9e6..1c60d0f640e37 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -610,9 +610,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         activation: str = "silu",
     ) -> torch.Tensor:
         assert activation == "silu", "Only SiLU activation is supported."
-        if apply_router_weight_on_input is not None:
+        if apply_router_weight_on_input:
             raise NotImplementedError(
-                "Apply router weight on input is not supported for"
+                "Apply router weight on input is not supported for "
                 "fused Marlin MoE method.")
 
         topk_weights, topk_ids = FusedMoE.select_experts(