[Quant] Aria SupportsQuant (#13416)

2025-12-10 13:44:58 +08:00 · 2025-02-18 00:51:09 -05:00 · 2025-02-18 00:51:09 -05:00 · d1b649f1ef
commit d1b649f1ef
parent ac19b519ed
1 changed files with 10 additions and 3 deletions
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -36,7 +36,7 @@ from .idefics2_vision_model import Idefics2VisionConfig
 from .idefics2_vision_model import (
    Idefics2VisionTransformer as Idefics3VisionTransformer)
 # yapf: enable
-from .interfaces import SupportsMultiModal
+from .interfaces import SupportsMultiModal, SupportsQuant
 from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                    is_pp_missing_parameter, maybe_prefix,
@@ -53,7 +53,8 @@ class AriaImagePixelInputs(TypedDict):
    """


-class AriaVisionTransformer(Idefics3VisionTransformer):
+class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
+    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}

    def __init__(
        self,
@@ -304,11 +305,17 @@ class AriaTextDecoderLayer(LlamaDecoderLayer):
        self.mlp = AriaTextMoELayer(config, quant_config=quant_config)


-class AriaTextModel(LlamaModel):
+class AriaTextModel(LlamaModel, SupportsQuant):
    """
    Custom LlamaModel for the AriaMoE model which modifies the standard
    LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`.
    """
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts.w13_weight": ["experts.fc1.weight"],
+        "experts.w2_weight": ["experts.fc2.weight"],
+    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config,