From d256cd23c125ccb6c2486e8b14e7999e51c68fc2 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Tue, 2 Sep 2025 09:24:24 -0400
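Subject: [PATCH] activation plumbing for fused_marlin_moe

Pass the layer's `activation` argument through to the fused Marlin MoE
call sites in the AWQ-Marlin, GPTQ-Marlin, FP8, compressed-tensors and
ModelOpt quantization methods instead of asserting up front that it is
"silu".

The `assert activation == "silu"` checks that guarded the Marlin calls
are removed. In `CompressedTensorsW4A4MoeMethod` and
`ModelOptNvFp4FusedMoE` the assertion is relocated rather than dropped:
it now sits after the call that receives `activation` (and, for
ModelOpt, also inside the FlashInfer TensorRT-LLM branch), so the code
paths that follow it still require SiLU.
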
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 vllm/model_executor/layers/quantization/awq_marlin.py |  3 +--
 .../compressed_tensors/compressed_tensors_moe.py      | 11 +++++------
 vllm/model_executor/layers/quantization/fp8.py        |  3 +--
 .../model_executor/layers/quantization/gptq_marlin.py |  3 +--
 vllm/model_executor/layers/quantization/modelopt.py   |  5 ++++-
 5 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 8293d42ef4556..5fcfbe7ce4f40 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -512,8 +512,6 @@ class AWQMoEMethod(FusedMoEMethodBase):
             raise NotImplementedError(
                 "EPLB not supported for `AWQMoEMethod` yet.")
 
-        assert activation == "silu", "Only SiLU activation is supported."
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -542,6 +540,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
             quant_type_id=self.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
+            activation=activation,
             expert_map=expert_map,
             w1_zeros=layer.w13_qzeros,
             w2_zeros=layer.w2_qzeros,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index e4585419226cd..1deb019298d0e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -364,7 +364,6 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for "
                                       "`CompressedTensorsW4A4MoeMethod` yet.")
-        assert activation == "silu", "Only SiLU activation is supported."
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -398,8 +397,11 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map)
 
+        assert activation == "silu", "Only SiLU activation is supported."
+
         # FlashInfer fused experts path
         if self.fused_experts is not None:
             assert is_valid_flashinfer_cutlass_fused_moe(
@@ -924,8 +926,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 a2_scale=layer.w2_input_scale,
                 expert_map=expert_map)
         if self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
             return torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -940,6 +940,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map)
 
         assert self.fused_experts_func is not None
@@ -1383,9 +1384,6 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
                 "EPLB not supported for "
                 "`CompressedTensorsWNA16MarlinMoEMethod` yet.")
 
-        assert activation == "silu", (
-            f"{activation} not supported for Marlin MoE.")
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -1414,6 +1412,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
             quant_type_id=self.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
+            activation=activation,
             expert_map=expert_map,
             g_idx1=layer.w13_weight_g_idx,
             g_idx2=layer.w2_weight_g_idx,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index d9e01dcf40d5a..c0321082070f7 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1077,8 +1077,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 block_shape=self.quant_config.weight_block_size,
                 expert_map=expert_map)
         elif self.use_marlin:
-            assert activation == "silu", (
-                f"{activation} not supported for Marlin MoE.")
             return torch.ops.vllm.fused_marlin_moe(
                 x,
                 layer.w13_weight,
@@ -1093,6 +1091,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map)
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             assert self.block_quant is None
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 3644d91f64e3c..d42073c0d47bf 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -661,8 +661,6 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             raise NotImplementedError(
                 "EPLB not supported for `GPTQMarlinMoEMethod` yet.")
 
-        assert activation == "silu", "Only SiLU activation is supported."
-
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
             router_logits=router_logits,
@@ -691,6 +689,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             quant_type_id=self.quant_type.id,
             apply_router_weight_on_input=apply_router_weight_on_input,
             global_num_experts=global_num_experts,
+            activation=activation,
             expert_map=expert_map,
             g_idx1=layer.w13_g_idx,
             g_idx2=layer.w2_g_idx,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 4bb8438d90844..27aba609ce39e 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1370,13 +1370,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.")
-        assert activation == "silu", "Only SiLU activation is supported."
 
         if self.allow_flashinfer and \
             self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             import flashinfer
 
             from vllm.model_executor.models.llama4 import Llama4MoE
+            assert activation == "silu", "Only SiLU activation is supported."
 
             a1_gscale = layer.w13_input_scale_quant
             (hidden_states_fp4,
@@ -1458,8 +1458,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
+                activation=activation,
                 expert_map=expert_map)
 
+        assert activation == "silu", "Only SiLU activation is supported."
+
         if self.fused_experts is not None:
             assert self.allow_flashinfer and \
                self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS