From c5830381afbef44023ec1c97ae61ff02f22b1f9a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 12 Aug 2025 20:38:18 -0700
Subject: [PATCH] [V0 Deprecation] Remove args for multi-step scheduling
 (#22779)

Signed-off-by: Woosuk Kwon
---
 tests/utils_/test_utils.py |  1 -
 vllm/config/scheduler.py   | 27 +--------------------------
 2 files changed, 1 insertion(+), 27 deletions(-)

diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index 8be1e103dc65..084d82dee11b 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -161,7 +161,6 @@ def parser_with_config():
     parser.add_argument('--port', type=int)
     parser.add_argument('--tensor-parallel-size', type=int)
     parser.add_argument('--trust-remote-code', action='store_true')
-    parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean)
     return parser
 
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index db669600a0cc..93002012799a 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -115,12 +115,6 @@ class SchedulerConfig:
     (e.g., beam search), recomputation is not currently supported. In
     such a case, we use swapping instead."""
 
-    num_scheduler_steps: int = 1
-    """Maximum number of forward steps per scheduler call."""
-
-    multi_step_stream_outputs: bool = True
-    """If False, then multi-step will stream outputs at the end of all steps"""
-
     send_delta_data: bool = False
     """Private API. If used, scheduler sends delta data to
     workers instead of an entire data. It should be enabled only
@@ -193,16 +187,7 @@ class SchedulerConfig:
 
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
-                if self.num_scheduler_steps > 1:
-                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
-                    # for now. Have max_num_batched_tokens set to max_model_len
-                    # so we don't reject sequences on account of a short
-                    # max_num_batched_tokens.
-                    self.max_num_batched_tokens = max(
-                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-                else:
-                    self.max_num_batched_tokens = (
-                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
             else:
                 # If max_model_len is too short, use
                 # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
@@ -293,12 +278,6 @@ class SchedulerConfig:
                 f"({self.num_lookahead_slots}) must be greater than or "
                 "equal to 0.")
 
-        if self.num_scheduler_steps < 1:
-            raise ValueError(
-                "num_scheduler_steps "
-                f"({self.num_scheduler_steps}) must be greater than or "
-                "equal to 1.")
-
         if self.max_num_partial_prefills < 1:
             raise ValueError(
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
@@ -323,7 +302,3 @@ class SchedulerConfig:
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
 
         return self
-
-    @property
-    def is_multi_step(self) -> bool:
-        return self.num_scheduler_steps > 1
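Note (illustration, not part of the patch): after the test_utils.py change, the fixture's parser no longer knows --multi-step-stream-outputs. A hypothetical check of the rejection behavior, using stdlib argparse in place of vLLM's FlexibleArgumentParser and StoreBoolean:

# Hypothetical sketch with stdlib argparse; the real fixture uses vLLM's
# FlexibleArgumentParser, but unknown flags are rejected the same way.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
parser.add_argument('--trust-remote-code', action='store_true')

try:
    # The removed --multi-step-stream-outputs flag is no longer recognized.
    parser.parse_args(['--multi-step-stream-outputs', 'True'])
except SystemExit:
    # argparse prints "unrecognized arguments" and exits with status 2.
    print('flag rejected, as expected after this patch')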
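With num_scheduler_steps removed, the chunked-prefill branch of the max_num_batched_tokens default in scheduler.py collapses to a single assignment. A minimal standalone sketch of the resulting defaulting logic, assuming DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 (the helper function below is hypothetical; the real logic lives in SchedulerConfig.__post_init__):

# Illustrative sketch only; mirrors the post-patch defaulting logic in
# vllm/config/scheduler.py. The constant's value and the helper are assumptions.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048  # assumed value, for illustration

def default_max_num_batched_tokens(enable_chunked_prefill: bool,
                                   max_model_len: int) -> int:
    if enable_chunked_prefill:
        # Multi-step scheduling is gone, so there is no longer a
        # num_scheduler_steps > 1 special case forcing max_model_len here.
        return DEFAULT_MAX_NUM_BATCHED_TOKENS
    # Without chunked prefill, a whole prompt must fit in one batch, so a
    # too-short max_model_len is raised to the default (inferred from the
    # surviving comment in the hunk above).
    return max(max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)

# Example: with chunked prefill, the batch cap stays at the default
# even for long-context models; without it, max_model_len wins.
assert default_max_num_batched_tokens(True, 32768) == 2048
assert default_max_num_batched_tokens(False, 32768) == 32768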