[Model Runner V2] Add minor clarification comments for Eagle (#29332)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2026-05-30 10:57:05 +08:00 · 2025-11-24 11:28:56 -08:00 · 2025-11-24 11:28:56 -08:00 · 97588c4d12
commit 97588c4d12
parent 839c6b7b72
1 changed file with 11 additions and 0 deletions
--- a/vllm/v1/worker/gpu/spec_decode/eagle.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -65,6 +65,12 @@ class EagleSpeculator:
        # [num_reqs]
        next_prefill_tokens: torch.Tensor,
    ) -> torch.Tensor:
+        # NOTE(woosuk): The CPU does not know how many tokens were rejected, and
+        # finding out would require a CPU-GPU synchronization. To avoid that, we
+        # keep eagle's input_ids and hidden_states the same size as the target
+        # model's, i.e., each request's query length is padded to include any
+        # rejected positions. This also lets us reuse the target model's
+        # attention metadata (e.g., query_start_loc, seq_lens).
        if aux_hidden_states:
            assert self.method == "eagle3"
            hidden_states = self.model.combine_hidden_states(
@@ -110,6 +116,11 @@ class EagleSpeculator:
        # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
        # used for draft and target sampling.
        pos = input_batch.positions[last_token_indices] + 1
+        # NOTE(woosuk): For draft sampling, we only consider the temperature
+        # and ignore the other sampling parameters such as top_k and top_p,
+        # for simplicity and performance.
+        # While this may slightly degrade the acceptance rate, it does not
+        # affect the output distribution after rejection sampling.
        draft_tokens = gumbel_sample(
            logits, temperature, seed, pos, apply_temperature=True
        )