mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-03 16:44:26 +08:00
[V0 Deprecation] Remove num_lookahead_slots (#29000)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
parent
1c7bcc55b8
commit
20e4497be2
@@ -62,15 +62,6 @@ class SchedulerConfig:
|
|||||||
"""For chunked prefill, a request is considered long if the prompt is
|
"""For chunked prefill, a request is considered long if the prompt is
|
||||||
longer than this number of tokens."""
|
longer than this number of tokens."""
|
||||||
|
|
||||||
num_lookahead_slots: int = Field(default=0, ge=0)
|
|
||||||
"""The number of slots to allocate per sequence per
|
|
||||||
step, beyond the known token ids. This is used in speculative
|
|
||||||
decoding to store KV activations of tokens which may or may not be
|
|
||||||
accepted.
|
|
||||||
|
|
||||||
NOTE: This will be replaced by speculative config in the future; it is
|
|
||||||
present to enable correctness tests until then."""
|
|
||||||
|
|
||||||
enable_chunked_prefill: bool = True
|
enable_chunked_prefill: bool = True
|
||||||
"""If True, prefill requests can be chunked based
|
"""If True, prefill requests can be chunked based
|
||||||
on the remaining `max_num_batched_tokens`.
|
on the remaining `max_num_batched_tokens`.
|
||||||
|
|||||||
@@ -634,16 +634,6 @@ class SpeculativeConfig:
|
|||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@property
|
|
||||||
def num_lookahead_slots(self) -> int:
|
|
||||||
"""The number of additional slots the scheduler should allocate per
|
|
||||||
step, in addition to the slots allocated for each known token.
|
|
||||||
|
|
||||||
This is equal to the number of speculative tokens, as each speculative
|
|
||||||
token must be scored.
|
|
||||||
"""
|
|
||||||
return self.num_speculative_tokens
|
|
||||||
|
|
||||||
def use_eagle(self) -> bool:
|
def use_eagle(self) -> bool:
|
||||||
return self.method in ("eagle", "eagle3", "mtp")
|
return self.method in ("eagle", "eagle3", "mtp")
|
||||||
|
|
||||||
|
|||||||
@@ -488,7 +488,6 @@ class EngineArgs:
|
|||||||
|
|
||||||
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
|
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
|
||||||
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
|
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
|
||||||
num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
|
|
||||||
model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
|
model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
|
||||||
ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
|
ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
|
||||||
|
|
||||||
@@ -1081,9 +1080,6 @@ class EngineArgs:
|
|||||||
"--long-prefill-token-threshold",
|
"--long-prefill-token-threshold",
|
||||||
**scheduler_kwargs["long_prefill_token_threshold"],
|
**scheduler_kwargs["long_prefill_token_threshold"],
|
||||||
)
|
)
|
||||||
scheduler_group.add_argument(
|
|
||||||
"--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
|
|
||||||
)
|
|
||||||
# multi-step scheduling has been removed; corresponding arguments
|
# multi-step scheduling has been removed; corresponding arguments
|
||||||
# are no longer supported.
|
# are no longer supported.
|
||||||
scheduler_group.add_argument(
|
scheduler_group.add_argument(
|
||||||
@@ -1653,18 +1649,11 @@ class EngineArgs:
|
|||||||
target_parallel_config=parallel_config,
|
target_parallel_config=parallel_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
# make sure num_lookahead_slots is set appropriately depending on
|
|
||||||
# whether speculative decoding is enabled
|
|
||||||
num_lookahead_slots = self.num_lookahead_slots
|
|
||||||
if speculative_config is not None:
|
|
||||||
num_lookahead_slots = speculative_config.num_lookahead_slots
|
|
||||||
|
|
||||||
scheduler_config = SchedulerConfig(
|
scheduler_config = SchedulerConfig(
|
||||||
runner_type=model_config.runner_type,
|
runner_type=model_config.runner_type,
|
||||||
max_num_batched_tokens=self.max_num_batched_tokens,
|
max_num_batched_tokens=self.max_num_batched_tokens,
|
||||||
max_num_seqs=self.max_num_seqs,
|
max_num_seqs=self.max_num_seqs,
|
||||||
max_model_len=model_config.max_model_len,
|
max_model_len=model_config.max_model_len,
|
||||||
num_lookahead_slots=num_lookahead_slots,
|
|
||||||
enable_chunked_prefill=self.enable_chunked_prefill,
|
enable_chunked_prefill=self.enable_chunked_prefill,
|
||||||
disable_chunked_mm_input=self.disable_chunked_mm_input,
|
disable_chunked_mm_input=self.disable_chunked_mm_input,
|
||||||
is_multimodal_model=model_config.is_multimodal_model,
|
is_multimodal_model=model_config.is_multimodal_model,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user