Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Bugfix] Disable shared expert overlap if Marlin MoE is used (#28410)
Signed-off-by: mgoin <mgoin64@gmail.com>
commit e5f599d4d1
parent 28534b92b9
@@ -678,6 +678,10 @@ class FusedMoE(CustomOp):
             and self.moe_config.use_flashinfer_cutlass_kernels
         )
 
+    @property
+    def use_marlin_kernels(self):
+        return getattr(self.quant_method, "use_marlin", False)
+
     @property
     def use_dp_chunking(self) -> bool:
         return (
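For context, the new `use_marlin_kernels` property is a defaulted-attribute probe: `getattr` with a `False` fallback means any quant method that never sets `use_marlin` is treated as non-Marlin. A minimal standalone sketch of that pattern (the stand-in classes below are hypothetical, not vLLM types):

# Minimal sketch (not vLLM code) of the capability-probe pattern used by
# the new property: the flag is optional, and its absence means "no Marlin".

class _MarlinBackedMethod:
    """Stand-in for a quant method that opts into Marlin."""
    use_marlin = True

class _PlainMethod:
    """Stand-in for a quant method that never sets the flag."""

def use_marlin_kernels(quant_method) -> bool:
    # Mirrors the new FusedMoE property: absent attribute defaults to False.
    return getattr(quant_method, "use_marlin", False)

assert use_marlin_kernels(_MarlinBackedMethod()) is True
assert use_marlin_kernels(_PlainMethod()) is False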
@@ -28,17 +28,17 @@ class SharedFusedMoE(FusedMoE):
         super().__init__(**kwargs)
         self._shared_experts = shared_experts
 
-        # Disable shared expert overlap if we are using eplb, because of
-        # correctness issues, or if using flashinfer with DP, since there
-        # is nothing to be gained in this case. Disabling the overlap
-        # optimization also prevents the shared experts from being hidden
-        # from torch.compile.
+        # Disable shared expert overlap if:
+        # - we are using eplb, because of correctness issues
+        # - we are using flashinfer with DP, since there is nothing to gain
+        # - we are using Marlin kernels
         self.use_overlapped = (
             use_overlapped
             and not (
                 # TODO(wentao): find the root cause and remove this condition
                 self.enable_eplb
                 or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
+                or self.use_marlin_kernels
             )
             and self._shared_experts is not None
         )
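The gating expression above is easiest to read as a pure predicate: overlap must be requested, shared experts must exist, and no blocker may be active. A minimal sketch under that reading (the function name and flat argument list are hypothetical; vLLM reads these flags off `self`):

# Minimal sketch (not vLLM code) of the overlap-gating logic this hunk
# builds inline. Any single True blocker vetoes the overlap optimization.

def should_overlap_shared_experts(
    requested: bool,
    enable_eplb: bool,
    use_flashinfer_cutlass_kernels: bool,
    dp_size: int,
    use_marlin_kernels: bool,
    has_shared_experts: bool,
) -> bool:
    blocked = (
        enable_eplb
        or (use_flashinfer_cutlass_kernels and dp_size > 1)
        or use_marlin_kernels
    )
    return requested and not blocked and has_shared_experts

# Marlin alone is enough to disable the overlap:
assert not should_overlap_shared_experts(
    requested=True,
    enable_eplb=False,
    use_flashinfer_cutlass_kernels=False,
    dp_size=1,
    use_marlin_kernels=True,
    has_shared_experts=True,
)

Written this way, the Marlin condition added by this commit is one more independent veto rather than a change to the overlap mechanism itself.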
@@ -424,6 +424,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
         if self.quant_config.weight_bits != 4:
             raise ValueError("AWQMoEMethod only supports 4bit now.")
         self.quant_type = scalar_types.uint4
+        self.use_marlin = True
 
     def create_weights(
         self,
@@ -1342,6 +1342,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
                 f"{WNA16_SUPPORTED_BITS}",
             )
         self.quant_type = WNA16_SUPPORTED_TYPES_MAP[self.num_bits]
+        self.use_marlin = True
 
     def create_weights(
         self,
@@ -482,6 +482,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
             self.quant_type = scalar_types.uint8b128
         else:
             raise ValueError("GPTQMarlinMoEMethod only supports int4 and int8 now.")
+        self.use_marlin = True
 
     def create_weights(
         self,
@@ -216,6 +216,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
         self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
+        self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
         )
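The four quant-method hunks all follow one pattern: each Marlin-backed method advertises itself by setting `self.use_marlin` in `__init__`, unconditionally for AWQ, GPTQ, and compressed-tensors WNA16, and conditionally on the selected backend for MXFP4. That flag is exactly what the new `FusedMoE.use_marlin_kernels` property probes. A minimal sketch of the two variants (class and enum names are stand-ins, not vLLM types):

# Minimal sketch (not vLLM code) of the flag-setting pattern in these hunks.

import enum

class Backend(enum.Enum):
    MARLIN = "marlin"
    TRITON = "triton"

class MarlinOnlyLikeMethod:
    def __init__(self) -> None:
        # Unconditional: this method only exists as a Marlin implementation.
        self.use_marlin = True

class Mxfp4LikeMethod:
    def __init__(self, backend: Backend) -> None:
        # Conditional: MXFP4 can run on several backends; only the Marlin
        # one should trip the shared-expert-overlap guard.
        self.use_marlin = backend == Backend.MARLIN

assert MarlinOnlyLikeMethod().use_marlin
assert Mxfp4LikeMethod(Backend.MARLIN).use_marlin
assert not Mxfp4LikeMethod(Backend.TRITON).use_marlin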