[Bugfix] Fix EAGLE vocab embedding construction for Llama 70B (#19033)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
parent c8134bea15
commit 3465b87ef8
@@ -5,11 +5,11 @@ import copy
 import itertools

 import torch
-import triton
 from weight_shapes import WEIGHT_SHAPES

 from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
 from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
+from vllm.triton_utils import triton


 @triton.testing.perf_report(
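The only change in this hunk is routing the Triton import through vllm.triton_utils instead of importing triton directly. As a rough illustration of why a project centralizes an optional dependency this way, here is a generic sketch of the pattern; the HAS_TRITON name and the fallback behavior are illustrative assumptions, not a claim about what vllm.triton_utils actually does.

# Generic sketch of a centralized optional import (illustrative only; not
# the actual vllm.triton_utils module). Callers import `triton` from this
# one place, so a missing dependency is handled in a single location.
try:
    import triton
    HAS_TRITON = True
except ImportError:
    # Hypothetical fallback: code that needs Triton can check HAS_TRITON
    # before touching the placeholder.
    triton = None
    HAS_TRITON = False

__all__ = ["triton", "HAS_TRITON"]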
@@ -9,6 +9,7 @@ import torch
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, SpeculativeConfig,
                          VllmConfig)
+from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.v1.spec_decode.eagle import EagleProposer

 model_dir = "meta-llama/Llama-3.1-8B-Instruct"
@@ -113,21 +114,26 @@ def test_prepare_inputs():
     assert torch.equal(token_indices, expected_token_indices)


-@pytest.mark.parametrize(
-    "method,proposer_helper,draft_model_dir,target_attribute_path", [
-        ("eagle", lambda k: _create_proposer("eagle", k), eagle_dir,
-         ('lm_head', )),
-        ("eagle3", lambda k: _create_proposer("eagle3", k), eagle3_dir,
-         ('model', 'embed_tokens')),
-    ])
+@pytest.mark.parametrize("method,proposer_helper", [
+    ("eagle", lambda k: _create_proposer("eagle", k)),
+    ("eagle3", lambda k: _create_proposer("eagle3", k)),
+])
+@pytest.mark.parametrize("pp_size", [1, 2])
+@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
 @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
 @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
 @mock.patch('vllm.v1.spec_decode.eagle.get_model')
 def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
-                    proposer_helper, draft_model_dir, target_attribute_path):
-    # Setup model mock
+                    proposer_helper, pp_size, use_distinct_embed_tokens):
+    # Setup draft model mock
     mock_model = mock.MagicMock()
+    if use_distinct_embed_tokens:
+        # Some models can have a different hidden size than the target model,
+        # so we test that their embed_tokens doesn't get overwritten
+        mock_model.model.embed_tokens.weight.shape = (131072, 2048)
+    else:
+        mock_model.model.embed_tokens.weight.shape = (131072, 4096)

     mock_get_model.return_value = mock_model

     # Setup mocks for attention layers
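For readers unfamiliar with stacked parametrize decorators: the rewritten test above is generated once per element of the cross product of its parameter sets, here 2 methods x 2 pipeline-parallel sizes x 2 embedding layouts, so 8 cases. A minimal, self-contained sketch of the same pattern:

# Minimal sketch of stacked @pytest.mark.parametrize decorators: pytest runs
# the test once for every combination, 2 x 2 x 2 = 8 invocations here.
import pytest


@pytest.mark.parametrize("method", ["eagle", "eagle3"])
@pytest.mark.parametrize("pp_size", [1, 2])
@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
def test_cross_product(method, pp_size, use_distinct_embed_tokens):
    assert method in ("eagle", "eagle3")
    assert pp_size in (1, 2)
    assert isinstance(use_distinct_embed_tokens, bool)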
@@ -145,22 +151,24 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,

     # Setup mock for pp group to return the appropriate value for world size
     mock_pp_group = mock.MagicMock()
-    mock_pp_group.world_size = 2 if method == "eagle" else 1
+    mock_pp_group.world_size = pp_size
     mock_get_pp_group.return_value = mock_pp_group

-    # Setup target model with the appropriate attributes
-    target_model = mock.MagicMock()
+    # Setup the target model mock with a custom class so that
+    # isinstance() checks match the expected type.
+    class _TargetModelStub(LlamaForCausalLM):
+        model: mock.MagicMock
+        lm_head: mock.MagicMock

-    # Create the necessary attributes on the target model
-    current_obj = target_model
-    for i, attr in enumerate(target_attribute_path):
-        if i == len(target_attribute_path) - 1:
-            # Set the last attribute in the path to a MagicMock
-            setattr(current_obj, attr, mock.MagicMock())
-        else:
-            # Create intermediate objects if needed
-            setattr(current_obj, attr, mock.MagicMock())
-            current_obj = getattr(current_obj, attr)
+    target_model = mock.create_autospec(_TargetModelStub, instance=True)
+    target_model.model = mock.MagicMock()
+    target_model.model.embed_tokens.weight.shape = (131072, 4096)
+
+    from vllm.model_executor.models import SupportsMultiModal
+    assert not isinstance(target_model, SupportsMultiModal)
+
+    if method == "eagle":
+        target_model.lm_head = mock.MagicMock()

     # Create proposer using the helper function
     proposer = proposer_helper(k=8)
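The key trick in the new test setup is that a mock built from a spec class passes isinstance() checks against that class and its bases, while a bare MagicMock does not; that is what lets the autospec'd _TargetModelStub stand in for a LlamaForCausalLM when the proposer inspects the target model. A small standalone sketch with generic class names instead of the vLLM types:

# Standalone sketch (generic names, not vLLM classes): a spec'd mock passes
# isinstance() checks against the spec class and its bases.
from unittest import mock


class BaseModel:
    pass


class StubModel(BaseModel):
    pass


plain_mock = mock.MagicMock()
spec_mock = mock.create_autospec(StubModel, instance=True)

assert not isinstance(plain_mock, BaseModel)  # plain mocks fail the check
assert isinstance(spec_mock, StubModel)       # spec'd mocks pass it
assert isinstance(spec_mock, BaseModel)       # ...including for base classes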
@@ -171,10 +179,18 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
     # Verify common interactions
     mock_get_model.assert_called_once()

-    # Verify the specific attribute sharing based on the method
+    # Verify that EAGLE models gain the lm head from the target model
     if method == "eagle":
         assert proposer.model.lm_head == target_model.lm_head
+
+    # Verify that the embed tokens are set correctly
+    # If pp_size is > 1, the embed tokens should be distinct
+    if pp_size > 1 or use_distinct_embed_tokens:
+        assert proposer.model.model.embed_tokens != \
+            target_model.model.embed_tokens
     else:
+        # When pp_size is 1 and the draft and target models have
+        # embed_tokens of the same shape, they should be shared.
         assert proposer.model.model.embed_tokens == \
             target_model.model.embed_tokens

@@ -55,13 +55,11 @@ class LlamaModel(nn.Module):
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size

-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )

         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -164,4 +162,4 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
             if "lm_head" not in name:
                 name = "model." + name
             model_weights[name] = loaded_weight
-        return loader.load_weights(model_weights.items())
+        loader.load_weights(model_weights.items())
@@ -10,7 +10,6 @@ from transformers import LlamaConfig

 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
-from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear
@@ -95,13 +94,11 @@ class LlamaModel(nn.Module):
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size

-        # if PP disabled then draft will share embed with target
-        if get_pp_group().world_size > 1:
-            self.embed_tokens = VocabParallelEmbedding(
-                self.config.vocab_size,
-                self.config.hidden_size,
-                prefix=maybe_prefix(prefix, "embed_tokens"),
-            )
+        self.embed_tokens = VocabParallelEmbedding(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            prefix=maybe_prefix(prefix, "embed_tokens"),
+        )

         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
@@ -240,6 +237,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         model_weights = {}
         includes_draft_id_mapping = False
+        includes_embed_tokens = False
         for name, loaded_weight in weights:
             if "t2d" in name:
                 continue
@@ -248,12 +246,18 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
                 includes_draft_id_mapping = True
             elif "lm_head" not in name:
                 name = "model." + name
+            if "embed_tokens" in name:
+                includes_embed_tokens = True
             model_weights[name] = loaded_weight
+
+        skip_substrs = []
+        if not includes_draft_id_mapping:
+            skip_substrs.append("draft_id_to_target_id")
+        if not includes_embed_tokens:
+            skip_substrs.append("embed_tokens")
         loader = AutoWeightsLoader(
             self,
             skip_prefixes=None,
-            skip_substrs=["draft_id_to_target_id"] \
-                if not includes_draft_id_mapping else None,
+            skip_substrs=skip_substrs,
         )
         loader.load_weights(model_weights.items())
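This load_weights change collects the substrings to skip into one list instead of a single conditional expression, so embed_tokens is only skipped when the checkpoint does not provide its own copy. As a rough illustration of substring-based skipping (this is not the AutoWeightsLoader implementation, just the filtering idea behind skip_substrs):

# Illustration of substring-based weight filtering; not the actual
# AutoWeightsLoader logic, only the idea behind skip_substrs.
def keep_weight(name: str, skip_substrs: list[str]) -> bool:
    return not any(substr in name for substr in skip_substrs)


# Hypothetical checkpoint that ships neither an embedding nor a draft-id map:
skip_substrs = ["draft_id_to_target_id", "embed_tokens"]
assert keep_weight("model.layers.0.self_attn.qkv_proj.weight", skip_substrs)
assert not keep_weight("model.embed_tokens.weight", skip_substrs)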
@@ -173,6 +173,7 @@ class CudaPlatformBase(Platform):
     def get_current_memory_usage(cls,
                                  device: Optional[torch.types.Device] = None
                                  ) -> float:
+        torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
         return torch.cuda.max_memory_allocated(device)

@@ -899,6 +899,7 @@ class DeviceMemoryProfiler:
     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
         from vllm.platforms import current_platform
         gc.collect()
-        return current_platform.get_current_memory_usage(self.device)
+        return current_platform.get_current_memory_usage(self.device)

     def __enter__(self):
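Both memory hunks tighten the same measurement: gc.collect() drops unreachable Python objects and torch.cuda.empty_cache() returns cached blocks to the allocator before the peak counters are reset and read, so the reported figure tracks live allocations more closely. A hedged sketch of the resulting measurement pattern, standalone and guarded so it only touches CUDA where it is available:

# Sketch of the measurement pattern used above: collect garbage and flush the
# CUDA caching allocator before reading the peak-memory counter.
import gc

import torch


def current_cuda_memory_bytes(device=None) -> float:
    gc.collect()
    if not torch.cuda.is_available():
        return 0.0
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(device)
    return torch.cuda.max_memory_allocated(device)


if __name__ == "__main__":
    print(f"current usage: {current_cuda_memory_bytes() / 2**20:.1f} MiB")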
@@ -330,16 +330,19 @@ class EagleProposer:
         self.attn_layer_names = list(draft_attn_layer_names)

         # share embed_tokens with the target model if needed
-        if get_pp_group().world_size == 1:
+        if get_pp_group().world_size == 1 \
+            and self.model.model.embed_tokens.weight.shape \
+                == target_model.model.embed_tokens.weight.shape:
             logger.info(
-                "The EAGLE head shares the same vocab embedding" \
+                "Assuming the EAGLE head shares the same vocab embedding" \
                 " with the target model."
             )
+            del self.model.model.embed_tokens
             self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
             logger.info(
-                "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
-                " weights instead of sharing them with the target model."
+                "The EAGLE head's vocab embedding will be loaded separately" \
+                " from the target model."
             )

         # share lm_head with the target model if needed
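The core of the bugfix is the new condition in EagleProposer.load_model: the draft head now reuses the target model's vocab embedding only when pipeline parallelism is disabled and the two embedding tensors actually have the same shape; otherwise the head keeps the embedding it loaded itself. A small sketch of just that decision, using the illustrative shapes from the test above (they are test fixtures, not real Llama 70B dimensions):

# Sketch of the new sharing rule only; the real logic lives in
# EagleProposer.load_model and operates on actual embedding weights.
def should_share_embed_tokens(pp_world_size: int,
                              draft_shape: tuple[int, int],
                              target_shape: tuple[int, int]) -> bool:
    return pp_world_size == 1 and draft_shape == target_shape


# Same shapes, no pipeline parallelism: share the target's embedding.
assert should_share_embed_tokens(1, (131072, 4096), (131072, 4096))
# Draft head with a narrower hidden size: keep its own embedding.
assert not should_share_embed_tokens(1, (131072, 2048), (131072, 4096))
# Pipeline parallelism enabled: always load separately.
assert not should_share_embed_tokens(2, (131072, 4096), (131072, 4096))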