diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 8194295ffedb6..b6078706daacf 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -62,15 +62,6 @@ class SchedulerConfig: """For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.""" - num_lookahead_slots: int = Field(default=0, ge=0) - """The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - - NOTE: This will be replaced by speculative config in the future; it is - present to enable correctness tests until then.""" - enable_chunked_prefill: bool = True """If True, prefill requests can be chunked based on the remaining `max_num_batched_tokens`. diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 13a8632413d91..a0c65b6049e1e 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -634,16 +634,6 @@ class SpeculativeConfig: return self - @property - def num_lookahead_slots(self) -> int: - """The number of additional slots the scheduler should allocate per - step, in addition to the slots allocated for each known token. - - This is equal to the number of speculative tokens, as each speculative - token must be scored. - """ - return self.num_speculative_tokens - def use_eagle(self) -> bool: return self.method in ("eagle", "eagle3", "mtp") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 68205b6079d78..74828bc109cbe 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -488,7 +488,6 @@ class EngineArgs: ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override - num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") @@ -1081,9 +1080,6 @@ class EngineArgs: "--long-prefill-token-threshold", **scheduler_kwargs["long_prefill_token_threshold"], ) - scheduler_group.add_argument( - "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"] - ) # multi-step scheduling has been removed; corresponding arguments # are no longer supported. scheduler_group.add_argument( @@ -1653,18 +1649,11 @@ class EngineArgs: target_parallel_config=parallel_config, ) - # make sure num_lookahead_slots is set appropriately depending on - # whether speculative decoding is enabled - num_lookahead_slots = self.num_lookahead_slots - if speculative_config is not None: - num_lookahead_slots = speculative_config.num_lookahead_slots - scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, - num_lookahead_slots=num_lookahead_slots, enable_chunked_prefill=self.enable_chunked_prefill, disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model,