[V0 Deprecation] Remove num_lookahead_slots (#29000)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2026-06-03 16:44:26 +08:00 · 2025-11-20 14:39:10 +08:00 · 2025-11-20 14:39:10 +08:00 · 20e4497be2
commit 20e4497be2
parent 1c7bcc55b8
3 changed files with 0 additions and 30 deletions
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@ -62,15 +62,6 @@ class SchedulerConfig:
    """For chunked prefill, a request is considered long if the prompt is
    longer than this number of tokens."""
    num_lookahead_slots: int = Field(default=0, ge=0)
    """The number of slots to allocate per sequence per
    step, beyond the known token ids. This is used in speculative
    decoding to store KV activations of tokens which may or may not be
    accepted.
    NOTE: This will be replaced by speculative config in the future; it is
    present to enable correctness tests until then."""
    enable_chunked_prefill: bool = True
    """If True, prefill requests can be chunked based
    on the remaining `max_num_batched_tokens`.
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@ -634,16 +634,6 @@ class SpeculativeConfig:
        return self
    @property
    def num_lookahead_slots(self) -> int:
        """Extra per-step slots the scheduler should reserve, on top of the
        slots already allocated for each known token.

        Every speculative token has to be scored, so the value is simply the
        number of speculative tokens.
        """
        speculative_token_count = self.num_speculative_tokens
        return speculative_token_count
    def use_eagle(self) -> bool:
        """Return True when `self.method` is "eagle", "eagle3" or "mtp"."""
        eagle_style_methods = ("eagle", "eagle3", "mtp")
        return self.method in eagle_style_methods
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -488,7 +488,6 @@ class EngineArgs:
    ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
    num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
    num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
    model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
    ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
@ -1081,9 +1080,6 @@ class EngineArgs:
            "--long-prefill-token-threshold",
            **scheduler_kwargs["long_prefill_token_threshold"],
        )
        scheduler_group.add_argument(
            "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
        )
        # multi-step scheduling has been removed; corresponding arguments
        # are no longer supported.
        scheduler_group.add_argument(
@ -1653,18 +1649,11 @@ class EngineArgs:
            target_parallel_config=parallel_config,
        )
        # make sure num_lookahead_slots is set appropriately depending on
        # whether speculative decoding is enabled
        num_lookahead_slots = self.num_lookahead_slots
        if speculative_config is not None:
            num_lookahead_slots = speculative_config.num_lookahead_slots
        scheduler_config = SchedulerConfig(
            runner_type=model_config.runner_type,
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
            num_lookahead_slots=num_lookahead_slots,
            enable_chunked_prefill=self.enable_chunked_prefill,
            disable_chunked_mm_input=self.disable_chunked_mm_input,
            is_multimodal_model=model_config.is_multimodal_model,