From 6b2427f995a81377e5758b34c8bb4c66db2f67bc Mon Sep 17 00:00:00 2001
From: Chen Xia
Date: Wed, 23 Apr 2025 17:32:40 -0700
Subject: [PATCH] [Quantization]add prefix for commandA quantized model (#17017)

---
 vllm/model_executor/models/commandr.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index bb8d9bf8a03c..8912affe36fe 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -89,6 +89,7 @@ class CohereMLP(nn.Module):
         self,
         config: CohereConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.config = config
@@ -99,12 +100,14 @@ class CohereMLP(nn.Module):
             [self.intermediate_size] * 2,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
         )
         self.down_proj = RowParallelLinear(
             self.intermediate_size,
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
         )
         self.act_fn = SiluAndMul()
 
@@ -158,12 +161,14 @@ class CohereAttention(nn.Module):
             self.total_num_kv_heads,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             self.hidden_size,
             bias=False,
             quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -244,7 +249,9 @@ class CohereDecoderLayer(nn.Module):
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.self_attn")
 
-        self.mlp = CohereMLP(config, quant_config=quant_config)
+        self.mlp = CohereMLP(config,
+                             quant_config=quant_config,
+                             prefix=f"{prefix}.mlp")
 
         self.input_layernorm = LayerNorm(param_shape=(config.hidden_size),
                                          eps=config.layer_norm_eps)
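
Note on why the prefix matters: quantization configs generally resolve per-layer
settings (for example skip/"ignore" lists) by matching the fully qualified module
name, so each linear layer has to receive its prefix when it is constructed; before
this patch the CommandA MLP and attention projections were built with an empty
prefix. The sketch below is a minimal, self-contained illustration of that
prefix-based lookup, not vLLM code; `should_quantize` and `IGNORE_PATTERNS` are
hypothetical names.

    import fnmatch

    # Hypothetical skip list in the style of per-layer "ignore" entries found in
    # quantized checkpoints (e.g. keep down_proj and lm_head in full precision).
    IGNORE_PATTERNS = ["model.layers.*.mlp.down_proj", "lm_head"]

    def should_quantize(prefix: str) -> bool:
        """Quantize a layer unless its fully qualified name matches a skip pattern."""
        return not any(fnmatch.fnmatch(prefix, pat) for pat in IGNORE_PATTERNS)

    # With this patch, CohereMLP passes names like "model.layers.0.mlp.down_proj";
    # with an empty prefix the lookup could never match the intended layers.
    print(should_quantize("model.layers.0.mlp.down_proj"))     # False -> left in full precision
    print(should_quantize("model.layers.0.mlp.gate_up_proj"))  # True  -> quantized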