[V0 Deprecation] Remove num_lookahead_slots (#29000)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
commit 20e4497be2
parent 1c7bcc55b8
@@ -62,15 +62,6 @@ class SchedulerConfig:
     """For chunked prefill, a request is considered long if the prompt is
     longer than this number of tokens."""
 
-    num_lookahead_slots: int = Field(default=0, ge=0)
-    """The number of slots to allocate per sequence per
-    step, beyond the known token ids. This is used in speculative
-    decoding to store KV activations of tokens which may or may not be
-    accepted.
-
-    NOTE: This will be replaced by speculative config in the future; it is
-    present to enable correctness tests until then."""
-
     enable_chunked_prefill: bool = True
     """If True, prefill requests can be chunked based
     on the remaining `max_num_batched_tokens`.
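Note: the removed field carried a Pydantic-style constraint. A minimal sketch of what `Field(default=0, ge=0)` enforced, using a hypothetical stand-in model rather than vLLM's real SchedulerConfig:

    from pydantic import BaseModel, Field, ValidationError

    class OldSchedulerConfig(BaseModel):  # stand-in, not vLLM's class
        num_lookahead_slots: int = Field(default=0, ge=0)

    OldSchedulerConfig()  # ok: defaults to 0
    try:
        OldSchedulerConfig(num_lookahead_slots=-1)
    except ValidationError:
        print("rejected: ge=0 disallows negative slot counts")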
@@ -634,16 +634,6 @@ class SpeculativeConfig:
 
         return self
 
-    @property
-    def num_lookahead_slots(self) -> int:
-        """The number of additional slots the scheduler should allocate per
-        step, in addition to the slots allocated for each known token.
-
-        This is equal to the number of speculative tokens, as each speculative
-        token must be scored.
-        """
-        return self.num_speculative_tokens
-
     def use_eagle(self) -> bool:
         return self.method in ("eagle", "eagle3", "mtp")
 
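Note: the removed property was a pure pass-through to `num_speculative_tokens`, so the migration for callers is mechanical. A minimal sketch (the stand-in class below is illustrative, not the real SpeculativeConfig):

    from dataclasses import dataclass

    @dataclass
    class SpecConfigStandIn:
        num_speculative_tokens: int = 0

    cfg = SpecConfigStandIn(num_speculative_tokens=4)
    # Before this commit: cfg.num_lookahead_slots (property returning the field)
    # After this commit:  read the field directly
    assert cfg.num_speculative_tokens == 4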
@@ -488,7 +488,6 @@ class EngineArgs:
 
     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
     num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
-    num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
     model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
     ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
 
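Note: as this hunk shows, EngineArgs mirrors each config class's default on a field of its own, so deleting `SchedulerConfig.num_lookahead_slots` forces the mirror field out in the same commit. A sketch of that pattern with hypothetical names:

    from dataclasses import dataclass

    @dataclass
    class ConfigStandIn:
        num_gpu_blocks_override: int | None = None

    @dataclass
    class ArgsStandIn:
        # The default is read from the config class at definition time, so
        # removing a config field breaks the mirroring args field immediately.
        num_gpu_blocks_override: int | None = ConfigStandIn.num_gpu_blocks_override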
@@ -1081,9 +1080,6 @@ class EngineArgs:
             "--long-prefill-token-threshold",
             **scheduler_kwargs["long_prefill_token_threshold"],
         )
-        scheduler_group.add_argument(
-            "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
-        )
         # multi-step scheduling has been removed; corresponding arguments
         # are no longer supported.
         scheduler_group.add_argument(
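Note: the user-visible effect of this hunk is that argparse no longer knows the flag. A minimal sketch with a hypothetical parser (not vLLM's actual CLI wiring):

    import argparse

    parser = argparse.ArgumentParser()
    scheduler_group = parser.add_argument_group("SchedulerConfig")
    scheduler_group.add_argument("--long-prefill-token-threshold", type=int, default=0)
    # --num-lookahead-slots is intentionally not registered anymore.

    args, extra = parser.parse_known_args(["--num-lookahead-slots", "8"])
    print(extra)  # ['--num-lookahead-slots', '8'] left unconsumed; parse_args() would error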
@@ -1653,18 +1649,11 @@ class EngineArgs:
             target_parallel_config=parallel_config,
         )
 
-        # make sure num_lookahead_slots is set appropriately depending on
-        # whether speculative decoding is enabled
-        num_lookahead_slots = self.num_lookahead_slots
-        if speculative_config is not None:
-            num_lookahead_slots = speculative_config.num_lookahead_slots
-
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,
             max_num_seqs=self.max_num_seqs,
             max_model_len=model_config.max_model_len,
-            num_lookahead_slots=num_lookahead_slots,
             enable_chunked_prefill=self.enable_chunked_prefill,
             disable_chunked_mm_input=self.disable_chunked_mm_input,
             is_multimodal_model=model_config.is_multimodal_model,
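Note: per the deleted docstrings, the removed plumbing accounted for one KV-cache slot per speculative token on top of the known token ids, since every draft token is scored (its KV activations are written) even if it is ultimately rejected. A worked sketch of that arithmetic (function name is illustrative):

    def slots_to_allocate(num_known_tokens: int, num_speculative_tokens: int) -> int:
        # Reserve slots for the known token ids plus one slot per draft
        # token, whose KV activations are written even if it is rejected.
        return num_known_tokens + num_speculative_tokens

    assert slots_to_allocate(100, 4) == 104  # 100 known + 4 speculative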