[V0 Deprecation] Remove args for multi-step scheduling (#22779)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
Woosuk Kwon, 2025-08-12 20:38:18 -07:00, committed by GitHub
parent d31f97cf57
commit c5830381af
2 changed files with 1 addition and 27 deletions


@@ -161,7 +161,6 @@ def parser_with_config():
     parser.add_argument('--port', type=int)
     parser.add_argument('--tensor-parallel-size', type=int)
     parser.add_argument('--trust-remote-code', action='store_true')
-    parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean)
     return parser
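
With the argument definition gone from the fixture, the flag is simply unknown to the parser. Below is a minimal sketch using a plain argparse.ArgumentParser as a stand-in for the test fixture (the StoreBoolean action and the real fixture are not reproduced); it only illustrates that the remaining flags still parse while --multi-step-stream-outputs is now rejected.

```python
import argparse

# Hypothetical stand-in for the trimmed-down test parser.
parser = argparse.ArgumentParser()
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
parser.add_argument('--trust-remote-code', action='store_true')

# The surviving flags behave as before.
args = parser.parse_args(['--port', '8000', '--tensor-parallel-size', '2'])
print(args.port, args.tensor_parallel_size, args.trust_remote_code)

try:
    # With the argument removed, argparse reports an unrecognized flag
    # and exits, instead of silently accepting it.
    parser.parse_args(['--multi-step-stream-outputs', 'true'])
except SystemExit:
    print("--multi-step-stream-outputs is no longer accepted")
```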


@@ -115,12 +115,6 @@ class SchedulerConfig:
     (e.g., beam search), recomputation is not currently supported. In
     such a case, we use swapping instead."""

-    num_scheduler_steps: int = 1
-    """Maximum number of forward steps per scheduler call."""
-
-    multi_step_stream_outputs: bool = True
-    """If False, then multi-step will stream outputs at the end of all steps"""
-
     send_delta_data: bool = False
     """Private API. If used, scheduler sends delta data to
     workers instead of an entire data. It should be enabled only
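
Dropping the two fields also shrinks the constructor surface: keyword arguments that used to be accepted now fail fast. The snippet below is a hypothetical, heavily trimmed stand-in for SchedulerConfig (only two of its many fields are shown), used purely to illustrate that effect.

```python
from dataclasses import dataclass

# Hypothetical sketch of the trimmed config; not the real SchedulerConfig.
@dataclass
class SchedulerConfigSketch:
    max_model_len: int = 8192
    send_delta_data: bool = False

try:
    # Previously-valid keyword arguments are now unknown to the constructor.
    SchedulerConfigSketch(num_scheduler_steps=8, multi_step_stream_outputs=False)
except TypeError as exc:
    print(f"removed fields are rejected: {exc}")
```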
@@ -193,16 +187,7 @@ class SchedulerConfig:
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
-                if self.num_scheduler_steps > 1:
-                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
-                    # for now. Have max_num_batched_tokens set to max_model_len
-                    # so we don't reject sequences on account of a short
-                    # max_num_batched_tokens.
-                    self.max_num_batched_tokens = max(
-                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-                else:
-                    self.max_num_batched_tokens = (
-                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
             else:
                 # If max_model_len is too short, use
                 # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
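
With the multi-step branch gone, the chunked-prefill default for max_num_batched_tokens collapses to a single assignment. The helper below is an illustrative sketch, not the vLLM implementation: the value 2048 for DEFAULT_MAX_NUM_BATCHED_TOKENS is an assumption, and the non-chunked branch follows the comment in the diff about falling back when max_model_len is too short.

```python
# Assumed stand-in for the constant referenced in the diff.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048

def default_max_num_batched_tokens(enable_chunked_prefill: bool,
                                    max_model_len: int) -> int:
    """Sketch of the simplified defaulting logic after this change."""
    if enable_chunked_prefill:
        # No multi-step branch anymore: chunked prefill always starts
        # from the fixed default instead of being bumped to max_model_len.
        return DEFAULT_MAX_NUM_BATCHED_TOKENS
    # Without chunked prefill, keep room for a whole prompt when
    # max_model_len is short (per the comment retained in the diff).
    return max(max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
```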
@@ -293,12 +278,6 @@ class SchedulerConfig:
                 f"({self.num_lookahead_slots}) must be greater than or "
                 "equal to 0.")

-        if self.num_scheduler_steps < 1:
-            raise ValueError(
-                "num_scheduler_steps "
-                f"({self.num_scheduler_steps}) must be greater than or "
-                "equal to 1.")
-
         if self.max_num_partial_prefills < 1:
             raise ValueError(
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
@@ -323,7 +302,3 @@ class SchedulerConfig:
             f"max_num_partial_prefills ({self.max_num_partial_prefills}).")

         return self
-
-    @property
-    def is_multi_step(self) -> bool:
-        return self.num_scheduler_steps > 1
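
Because the is_multi_step property no longer exists, downstream code that branched on it should drop the branch; multi-step scheduling is effectively always off. A hypothetical transition helper for code that still has to run against both old and new configs:

```python
# Hypothetical helper for code migrating off the removed property.
# On old configs it returns the real value; on the new SchedulerConfig it
# falls back to False, since multi-step scheduling has been removed.
def is_multi_step(scheduler_config) -> bool:
    return bool(getattr(scheduler_config, "is_multi_step", False))
```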