From be4445072c4e1e5e3a2ebf0552e432fc86f137ca Mon Sep 17 00:00:00 2001
From: Zebing Lin
Date: Tue, 21 Oct 2025 02:19:00 -0400
Subject: [PATCH] [Fix][Spec Decode] Fix llama4 draft loading with different
 quantization (#27136)

Signed-off-by: linzebing
---
 vllm/model_executor/models/llama4_eagle.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index dd6337244ca6..90273463d64e 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -60,16 +60,23 @@ class LlamaModel(nn.Module):
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
-        self.layers = nn.ModuleList(
-            [
-                Llama4DecoderLayer(
-                    vllm_config=vllm_config,
-                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-                    config=self.config,
-                )
-                for i in range(self.config.num_hidden_layers)
-            ]
-        )
+        # Temporarily modify vllm_config.quant_config for draft model layers
+        original_quant_config = vllm_config.quant_config
+        vllm_config.quant_config = quant_config
+        try:
+            self.layers = nn.ModuleList(
+                [
+                    Llama4DecoderLayer(
+                        vllm_config=vllm_config,
+                        prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                        config=self.config,
+                    )
+                    for i in range(self.config.num_hidden_layers)
+                ]
+            )
+        finally:
+            # Restore original quant_config
+            vllm_config.quant_config = original_quant_config
         self.fc = torch.nn.Linear(
             self.config.hidden_size * 2, self.config.hidden_size, bias=False
         )
 
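Note (illustration, not part of the patch): the fix applies a swap-and-restore pattern so the draft layers are constructed with the draft model's quantization config instead of the one stored on the shared vllm_config, and the original value is put back even if construction fails. A minimal standalone sketch of that pattern, using a hypothetical config object and builder function rather than vLLM's actual API, might look like:

    import contextlib

    @contextlib.contextmanager
    def override_quant_config(config, draft_quant_config):
        # Temporarily point config.quant_config at the draft model's
        # quantization config; the finally block restores the original
        # value even if layer construction raises.
        original = config.quant_config
        config.quant_config = draft_quant_config
        try:
            yield config
        finally:
            config.quant_config = original

    # Hypothetical usage: build the draft layers under the draft
    # model's quantization, then let the target config be restored.
    # with override_quant_config(vllm_config, quant_config):
    #     layers = build_draft_layers(vllm_config)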