Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-17 07:25:01 +08:00)
Fix for attention layers to remain unquantized during moe_wna16 quant (#12570)
Fix to AWQ quant loading of the new R1 model. The new optimized MoE kernels for a large number of experts (`moe_wna16`) use AWQ quant, which requires the attention layers to be in 16-bit. The current merge has broken this, and `get_quant_method` must return None for those layers for loading to work correctly again.

---------

Signed-off-by: Srikanth Srinivas <srikanth@astrum.ai>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Beim <beim2015@outlook.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Ryan N <ryan.nguyen@centml.ai>
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Shawn Du <shawnd200@outlook.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Beim <805908499@qq.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Nishidha <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
Co-authored-by: fade_away <1028552010@qq.com>
Co-authored-by: weilong.yu <weilong.yu@shopee.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Eldar Kurtic <eldarkurtic314@gmail.com>
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Co-authored-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Shawn Du <shawnd200@outlook.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
parent c5932e5dac
commit b9986454fe
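For context, here is a minimal, self-contained sketch of the dispatch behavior this commit restores in `MoeWNA16Config.get_quant_method`. The classes below are simplified stand-ins for the vLLM originals (same names, empty bodies), and the `LinearBase` branch collapses the real GPTQ/AWQ delegation into a placeholder, so treat this as an illustration rather than the actual implementation:

from typing import List, Optional

# Simplified stand-ins for the vLLM classes involved; the real definitions
# live under vllm.model_executor.layers and are not this trivial.
class LinearBase: ...
class FusedMoE: ...
class Attention: ...  # e.g. the attention op itself, which is not a LinearBase
class UnquantizedLinearMethod: ...
class MoeWNA16Method: ...

class MoeWNA16ConfigSketch:
    """Mimics only the branch structure of MoeWNA16Config.get_quant_method."""

    def __init__(self, modules_to_not_convert: List[str]):
        self.modules_to_not_convert = modules_to_not_convert

    def get_quant_method(self, layer: object,
                         prefix: str) -> Optional[object]:
        if any(m in prefix for m in self.modules_to_not_convert):
            # Explicitly skipped modules stay in 16-bit.
            return UnquantizedLinearMethod()
        elif isinstance(layer, LinearBase):
            # The real code delegates to the underlying GPTQ/AWQ config
            # here; this placeholder stands in for that delegation.
            return UnquantizedLinearMethod()
        elif isinstance(layer, FusedMoE):
            # Only the fused MoE expert weights use the moe_wna16 kernels.
            return MoeWNA16Method()
        # Per the commit message, anything else must fall through to None,
        # which tells vLLM to leave the layer unquantized (16-bit).
        return None

cfg = MoeWNA16ConfigSketch(modules_to_not_convert=[])
assert isinstance(cfg.get_quant_method(FusedMoE(), "mlp.experts"),
                  MoeWNA16Method)
assert cfg.get_quant_method(Attention(), "self_attn.attn") is None

The key point is the final return None: before this commit, a catch-all else branch tried to delegate every remaining layer to the GPTQ/AWQ config, which broke loading.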
@@ -7,7 +7,8 @@ import torch
 from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.linear import (LinearBase,
+                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.base_config import (
@@ -125,9 +126,7 @@ class MoeWNA16Config(QuantizationConfig):
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
             return UnquantizedLinearMethod()
-        elif isinstance(layer, FusedMoE):
-            return MoeWNA16Method(self)
-        else:
+        elif isinstance(layer, LinearBase):
             if self.linear_quant_method == "gptq":
                 if self.use_marlin:
                     return GPTQMarlinConfig.from_config(
@@ -144,6 +143,9 @@ class MoeWNA16Config(QuantizationConfig):
                     self.full_config).get_quant_method(layer, prefix)
             else:
                 raise ValueError("moe_wna16 only support gptq and awq.")
+        elif isinstance(layer, FusedMoE):
+            return MoeWNA16Method(self)
+        return None
 
 
 def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
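A hedged usage sketch of the loading path this fix affects, assuming a vLLM build that includes this commit; the checkpoint name below is a placeholder for an AWQ-quantized MoE model, not an actual model ID:

from vllm import LLM, SamplingParams

# Force the moe_wna16 method so the MoE expert weights use the optimized
# kernels while, with this fix, the attention layers load in 16-bit.
llm = LLM(
    model="org/some-awq-moe-checkpoint",  # placeholder, not a real model ID
    quantization="moe_wna16",
    trust_remote_code=True,
)

outputs = llm.generate(["Hello"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)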