Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 05:04:58 +08:00)
[Fix][Spec Decode] Fix llama4 draft loading with different quantization (#27136)
Signed-off-by: linzebing <linzebing1995@gmail.com>
commit be4445072c (parent f381cf2302)
@@ -60,16 +60,23 @@ class LlamaModel(nn.Module):
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
 
-        self.layers = nn.ModuleList(
-            [
-                Llama4DecoderLayer(
-                    vllm_config=vllm_config,
-                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-                    config=self.config,
-                )
-                for i in range(self.config.num_hidden_layers)
-            ]
-        )
+        # Temporarily modify vllm_config.quant_config for draft model layers
+        original_quant_config = vllm_config.quant_config
+        vllm_config.quant_config = quant_config
+        try:
+            self.layers = nn.ModuleList(
+                [
+                    Llama4DecoderLayer(
+                        vllm_config=vllm_config,
+                        prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                        config=self.config,
+                    )
+                    for i in range(self.config.num_hidden_layers)
+                ]
+            )
+        finally:
+            # Restore original quant_config
+            vllm_config.quant_config = original_quant_config
         self.fc = torch.nn.Linear(
             self.config.hidden_size * 2, self.config.hidden_size, bias=False
         )
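The added lines apply a temporary-override pattern: the draft (speculative) model may use a different quantization than the target model, so the shared vllm_config.quant_config is pointed at the draft's quant_config while the Llama4DecoderLayer modules are constructed, and restored in a finally block so the override cannot leak, even if construction raises. Below is a minimal, self-contained sketch of that pattern, not the vLLM implementation: the VllmConfig dataclass here is a stripped-down stand-in, and build_draft_layers and draft_quant_config are hypothetical names introduced only for illustration.

from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class VllmConfig:
    # Hypothetical stand-in for vLLM's config: only the field this commit touches.
    quant_config: Optional[Any] = None


def build_draft_layers(vllm_config: VllmConfig, draft_quant_config: Any) -> list:
    # Temporarily point the shared config at the draft model's quantization,
    # build the layers, then restore the original value even if construction fails.
    original_quant_config = vllm_config.quant_config
    vllm_config.quant_config = draft_quant_config
    try:
        # Stand-in for constructing Llama4DecoderLayer modules: each "layer"
        # records which quant_config was in effect when it was created.
        return [f"layer-{i} built with {vllm_config.quant_config}" for i in range(3)]
    finally:
        # Restore so the target (verifier) model still sees its own quant_config.
        vllm_config.quant_config = original_quant_config


target_cfg = VllmConfig(quant_config="fp8")
print(build_draft_layers(target_cfg, draft_quant_config="bf16"))
print(target_cfg.quant_config)  # still "fp8": the override did not leak

The try/finally is the important part of the design: because vllm_config is shared state, an exception during layer construction would otherwise leave the target model's quant_config pointing at the draft's settings.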