From 6ecc1e411ba3e720ef85aa34bba338581bcb7f76 Mon Sep 17 00:00:00 2001 From: tjp_zju Date: Sun, 14 Dec 2025 18:20:51 +0800 Subject: [PATCH] =?UTF-8?q?[Bugfix]=20fix=20=5Fget=5Fquant=5Fmethod=20of?= =?UTF-8?q?=20FusedMoE=20for=20deepseekV3.2=20on=20non-NV=E2=80=A6=20(#300?= =?UTF-8?q?57)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: tjp_zju --- vllm/model_executor/layers/quantization/moe_wna16.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 0131a330f70d2..4bedb951a33f5 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -17,6 +17,9 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoEMethodBase, FusedMoeWeightScaleSupported, ) +from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( + UnquantizedFusedMoEMethod, +) from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( @@ -162,6 +165,8 @@ class MoeWNA16Config(QuantizationConfig): self, layer: torch.nn.Module, prefix: str ) -> Optional["QuantizeMethodBase"]: if is_layer_skipped_quant(prefix, self.modules_to_not_convert): + if isinstance(layer, FusedMoE): + return UnquantizedFusedMoEMethod(layer.moe_config) return UnquantizedLinearMethod() elif isinstance(layer, LinearBase): # Avoid circular import