diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 4a3fc2a1a6b9c..d905cc9eb0eff 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -57,9 +57,10 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             "input_activations")
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
+            # group_size=None means channelwise
+            group_size = weight_quant.group_size or -1
             # Prefer to use the MarlinMoE kernel when it is supported.
-            if not check_moe_marlin_supports_layer(layer,
-                                                   weight_quant.group_size):
+            if not check_moe_marlin_supports_layer(layer, group_size):
                 if (weight_quant.strategy in QuantizationStrategy.GROUP and
                         weight_quant.actorder in (ActivationOrdering.GROUP,
                                                   ActivationOrdering.DYNAMIC)):
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 56aafca87e9e6..1c60d0f640e37 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -610,9 +610,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         activation: str = "silu",
     ) -> torch.Tensor:
         assert activation == "silu", "Only SiLU activation is supported."
-        if apply_router_weight_on_input is not None:
+        if apply_router_weight_on_input:
            raise NotImplementedError(
-                "Apply router weight on input is not supported for"
+                "Apply router weight on input is not supported for "
                "fused Marlin MoE method.")
 
        topk_weights, topk_ids = FusedMoE.select_experts(
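
Note (not part of the diff): a minimal standalone sketch of the two behaviors the patch relies on. It assumes `weight_quant.group_size` is `None` for channelwise quantization and that `apply_router_weight_on_input` is a plain bool defaulting to `False`; the `SimpleNamespace` objects below are hypothetical stand-ins for the compressed-tensors config.

    from types import SimpleNamespace

    # Hypothetical stand-ins for the compressed-tensors weight-quant config.
    channelwise = SimpleNamespace(group_size=None)
    grouped = SimpleNamespace(group_size=128)

    # group_size=None means channelwise; the Marlin support check expects -1
    # in that case, so the patch maps None -> -1 before calling it.
    assert (channelwise.group_size or -1) == -1
    assert (grouped.group_size or -1) == 128

    # With a bool flag, the old `is not None` test was always True and raised
    # NotImplementedError even when the feature was not requested; testing
    # truthiness only raises when the flag is actually set.
    apply_router_weight_on_input = False
    assert (apply_router_weight_on_input is not None) is True  # old, buggy check
    assert bool(apply_router_weight_on_input) is False          # new check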