[V0 Deprecation] Remove num_lookahead_slots (#29000)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Cyrus Leung 2025-11-20 14:39:10 +08:00 committed by GitHub
parent 1c7bcc55b8
commit 20e4497be2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 0 additions and 30 deletions

View File

@@ -62,15 +62,6 @@ class SchedulerConfig:
"""For chunked prefill, a request is considered long if the prompt is
longer than this number of tokens."""
num_lookahead_slots: int = Field(default=0, ge=0)
"""The number of slots to allocate per sequence per
step, beyond the known token ids. This is used in speculative
decoding to store KV activations of tokens which may or may not be
accepted.
NOTE: This will be replaced by speculative config in the future; it is
present to enable correctness tests until then."""
enable_chunked_prefill: bool = True
"""If True, prefill requests can be chunked based
on the remaining `max_num_batched_tokens`.

View File

@@ -634,16 +634,6 @@ class SpeculativeConfig:
return self
@property
def num_lookahead_slots(self) -> int:
"""The number of additional slots the scheduler should allocate per
step, in addition to the slots allocated for each known token.
This is equal to the number of speculative tokens, as each speculative
token must be scored.
"""
return self.num_speculative_tokens
def use_eagle(self) -> bool:
return self.method in ("eagle", "eagle3", "mtp")

View File

@@ -488,7 +488,6 @@ class EngineArgs:
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
@@ -1081,9 +1080,6 @@
"--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"],
)
scheduler_group.add_argument(
"--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
)
# multi-step scheduling has been removed; corresponding arguments
# are no longer supported.
scheduler_group.add_argument(
@@ -1653,18 +1649,11 @@
target_parallel_config=parallel_config,
)
# make sure num_lookahead_slots is set appropriately depending on
# whether speculative decoding is enabled
num_lookahead_slots = self.num_lookahead_slots
if speculative_config is not None:
num_lookahead_slots = speculative_config.num_lookahead_slots
scheduler_config = SchedulerConfig(
runner_type=model_config.runner_type,
max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs,
max_model_len=model_config.max_model_len,
num_lookahead_slots=num_lookahead_slots,
enable_chunked_prefill=self.enable_chunked_prefill,
disable_chunked_mm_input=self.disable_chunked_mm_input,
is_multimodal_model=model_config.is_multimodal_model,