[V1][Spec Decode] Make Eagle model arch config driven (#17323)

Ekagra Ranjan 2025-04-28 22:22:02 -04:00 committed by GitHub
parent 86d9fc29cb
commit e136000595
3 changed files with 26 additions and 13 deletions

vllm/config.py

@@ -2401,7 +2401,8 @@ class SpeculativeConfig:
                     pass
                 else:
                     eagle_config = EAGLEConfig(
-                        self.draft_model_config.hf_config)
+                        self.draft_model_config.hf_config,
+                        method=self.method)
                     self.draft_model_config.hf_config = eagle_config
 
             if (self.num_speculative_tokens is not None
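The `method` value threaded into EAGLEConfig here comes straight from the user-facing speculative config. A hypothetical but plausible invocation (model names are illustrative, not taken from this diff):

    from vllm import LLM

    # Sketch only: `method` selects eagle vs. eagle3; EAGLEConfig now uses
    # it to derive the draft architecture instead of hardcoding Llama.
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",
        speculative_config={
            "method": "eagle",  # or "eagle3"
            "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
            "num_speculative_tokens": 3,
        },
    )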

vllm/transformers_utils/configs/eagle.py

@@ -15,6 +15,7 @@ class EAGLEConfig(PretrainedConfig):
 
     def __init__(self,
                  model: Union[PretrainedConfig, dict, None] = None,
                  truncated_vocab_size: Optional[int] = None,
+                 method: Optional[str] = 'eagle',
                  **kwargs):
         model_config: Union[PretrainedConfig, DeepseekV2Config, None]
@@ -45,7 +46,23 @@ class EAGLEConfig(PretrainedConfig):
         if not envs.VLLM_USE_V1:
             kwargs["architectures"] = ["EAGLEModel"]
         else:
-            kwargs["architectures"] = ["EagleLlamaForCausalLM"]
+            # Eagle model name should follow naming convention of
+            # LlamaForCausalLM -> EagleLlamaForCausalLM
+            if method == "eagle":
+                assert self.model is not None, \
+                    "model should not be None when method is eagle"
+                kwargs["architectures"] = [
+                    f"Eagle{arch}" for arch in self.model.architectures
+                ]
+            elif method == "eagle3":
+                assert self.model is not None, \
+                    "model should not be None when method is eagle3"
+                kwargs["architectures"] = [
+                    f"Eagle3{arch}" for arch in self.model.architectures
+                ]
+            else:
+                raise ValueError(f"Invalid method {method}. \
+                    Supported methods are eagle and eagle3.")
 
         super().__init__(**kwargs)
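The net effect is that the draft architecture name is now derived from the target model's architecture by prefixing the method, rather than being hardcoded to Llama. A standalone sketch of that naming rule (helper name is hypothetical, not vLLM API):

    def derive_draft_architectures(method: str,
                                   target_archs: list[str]) -> list[str]:
        # LlamaForCausalLM -> EagleLlamaForCausalLM / Eagle3LlamaForCausalLM
        prefix = {"eagle": "Eagle", "eagle3": "Eagle3"}.get(method)
        if prefix is None:
            raise ValueError(f"Invalid method {method}. "
                             "Supported methods are eagle and eagle3.")
        return [f"{prefix}{arch}" for arch in target_archs]

    assert (derive_draft_architectures("eagle3", ["LlamaForCausalLM"])
            == ["Eagle3LlamaForCausalLM"])

Any draft model registered under a derived name (e.g. a hypothetical EagleQwen2ForCausalLM) would then be picked up without further changes to this config class.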

vllm/v1/spec_decode/eagle.py

@@ -9,8 +9,7 @@ from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.loader import get_model_loader
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models.llama_eagle import EagleLlamaForCausalLM
-from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
+from vllm.model_executor.models import ModelRegistry
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -225,15 +224,11 @@ class EagleProposer:
         with set_default_torch_dtype(
                 draft_model_config.dtype), set_current_vllm_config(
                     self.vllm_config):
-            if self.vllm_config.speculative_config.method == "eagle":
-                self.model = EagleLlamaForCausalLM(
-                    model_config=draft_model_config,
-                    start_layer_id=target_layer_num).to(target_device)
-            else:
-                assert self.vllm_config.speculative_config.method == "eagle3"
-                self.model = Eagle3LlamaForCausalLM(
-                    model_config=draft_model_config,
-                    start_layer_id=target_layer_num).to(target_device)
+            draft_model_cls, arch = ModelRegistry.resolve_model_cls(
+                draft_model_config.architectures)
+            self.model = draft_model_cls(
+                model_config=draft_model_config,
+                start_layer_id=target_layer_num).to(target_device)
 
         loaded_weights = self.model.load_weights(
             loader.get_all_weights(
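With the per-method if/else removed, the proposer dispatches purely on the architecture string that EAGLEConfig derived. A toy sketch (stand-in classes; not vLLM's actual registry internals) of the lookup that resolve_model_cls performs:

    class EagleLlamaForCausalLM: ...   # stand-in for the real draft model
    class Eagle3LlamaForCausalLM: ...  # stand-in for the real draft model

    _DRAFT_REGISTRY = {
        "EagleLlamaForCausalLM": EagleLlamaForCausalLM,
        "Eagle3LlamaForCausalLM": Eagle3LlamaForCausalLM,
    }

    def resolve_model_cls(architectures: list[str]):
        # Return a (model_cls, arch) pair, mirroring the tuple the diff
        # unpacks into (draft_model_cls, arch).
        for arch in architectures:
            if arch in _DRAFT_REGISTRY:
                return _DRAFT_REGISTRY[arch], arch
        raise ValueError(f"No registered draft model for {architectures}")

    cls, arch = resolve_model_cls(["EagleLlamaForCausalLM"])
    assert arch == "EagleLlamaForCausalLM"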