From 1df491c522c92c3b15dea4a4cd92c437bbda9f3f Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 12 May 2025 23:50:04 -0400
Subject: [PATCH] [Bugfix] Fixes for new marlin moe usage (#18017)

Signed-off-by: mgoin
---
 .../compressed_tensors/compressed_tensors_moe.py       | 5 +++--
 vllm/model_executor/layers/quantization/gptq_marlin.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 4a3fc2a1a6b9c..d905cc9eb0eff 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -57,9 +57,10 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
             "input_activations")
 
         if quant_config._is_wNa16_group_channel(weight_quant, input_quant):
+            # group_size=None means channelwise
+            group_size = weight_quant.group_size or -1
             # Prefer to use the MarlinMoE kernel when it is supported.
-            if not check_moe_marlin_supports_layer(layer,
-                                                   weight_quant.group_size):
+            if not check_moe_marlin_supports_layer(layer, group_size):
                 if (weight_quant.strategy in QuantizationStrategy.GROUP and
                         weight_quant.actorder in (ActivationOrdering.GROUP,
                                                   ActivationOrdering.DYNAMIC)):
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 56aafca87e9e6..1c60d0f640e37 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -610,9 +610,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         activation: str = "silu",
     ) -> torch.Tensor:
         assert activation == "silu", "Only SiLU activation is supported."
-        if apply_router_weight_on_input is not None:
+        if apply_router_weight_on_input:
             raise NotImplementedError(
-                "Apply router weight on input is not supported for"
+                "Apply router weight on input is not supported for "
                 "fused Marlin MoE method.")
 
         topk_weights, topk_ids = FusedMoE.select_experts(