From c5830381afbef44023ec1c97ae61ff02f22b1f9a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 12 Aug 2025 20:38:18 -0700
Subject: [PATCH] [V0 Deprecation] Remove args for multi-step scheduling
 (#22779)

Signed-off-by: Woosuk Kwon
---
 tests/utils_/test_utils.py |  1 -
 vllm/config/scheduler.py   | 27 +--------------------------
 2 files changed, 1 insertion(+), 27 deletions(-)

diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py
index 8be1e103dc65..084d82dee11b 100644
--- a/tests/utils_/test_utils.py
+++ b/tests/utils_/test_utils.py
@@ -161,7 +161,6 @@ def parser_with_config():
     parser.add_argument('--port', type=int)
     parser.add_argument('--tensor-parallel-size', type=int)
     parser.add_argument('--trust-remote-code', action='store_true')
-    parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean)
     return parser
 
 
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index db669600a0cc..93002012799a 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -115,12 +115,6 @@ class SchedulerConfig:
     (e.g., beam search), recomputation is not currently supported. In
     such a case, we use swapping instead."""
 
-    num_scheduler_steps: int = 1
-    """Maximum number of forward steps per scheduler call."""
-
-    multi_step_stream_outputs: bool = True
-    """If False, then multi-step will stream outputs at the end of all steps"""
-
     send_delta_data: bool = False
     """Private API. If used, scheduler sends delta data to
     workers instead of an entire data. It should be enabled only
@@ -193,16 +187,7 @@ class SchedulerConfig:
 
         if self.max_num_batched_tokens is None:
             if self.enable_chunked_prefill:
-                if self.num_scheduler_steps > 1:
-                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
-                    # for now. Have max_num_batched_tokens set to max_model_len
-                    # so we don't reject sequences on account of a short
-                    # max_num_batched_tokens.
-                    self.max_num_batched_tokens = max(
-                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
-                else:
-                    self.max_num_batched_tokens = (
-                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
             else:
                 # If max_model_len is too short, use
                 # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
@@ -293,12 +278,6 @@ class SchedulerConfig:
                 f"({self.num_lookahead_slots}) must be greater than or "
                 "equal to 0.")
 
-        if self.num_scheduler_steps < 1:
-            raise ValueError(
-                "num_scheduler_steps "
-                f"({self.num_scheduler_steps}) must be greater than or "
-                "equal to 1.")
-
         if self.max_num_partial_prefills < 1:
             raise ValueError(
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}) "
@@ -323,7 +302,3 @@ class SchedulerConfig:
                 f"max_num_partial_prefills ({self.max_num_partial_prefills}).")
 
         return self
-
-    @property
-    def is_multi_step(self) -> bool:
-        return self.num_scheduler_steps > 1
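Note (illustration, not part of the patch): after the test_utils.py change, the fixture's parser no longer knows --multi-step-stream-outputs. A hypothetical check of the rejection behavior, using stdlib argparse in place of vLLM's FlexibleArgumentParser and StoreBoolean:

# Hypothetical sketch with stdlib argparse; the real fixture uses vLLM's
# FlexibleArgumentParser, but unknown flags are rejected the same way.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
parser.add_argument('--trust-remote-code', action='store_true')

try:
    # The removed --multi-step-stream-outputs flag is no longer recognized.
    parser.parse_args(['--multi-step-stream-outputs', 'True'])
except SystemExit:
    # argparse prints "unrecognized arguments" and exits with status 2.
    print('flag rejected, as expected after this patch')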
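With num_scheduler_steps removed, the chunked-prefill branch of the max_num_batched_tokens default in scheduler.py collapses to a single assignment. A minimal standalone sketch of the resulting defaulting logic, assuming DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 (the helper function below is hypothetical; the real logic lives in SchedulerConfig.__post_init__):

# Illustrative sketch only; mirrors the post-patch defaulting logic in
# vllm/config/scheduler.py. The constant's value and the helper are assumptions.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048  # assumed value, for illustration

def default_max_num_batched_tokens(enable_chunked_prefill: bool,
                                   max_model_len: int) -> int:
    if enable_chunked_prefill:
        # Multi-step scheduling is gone, so there is no longer a
        # num_scheduler_steps > 1 special case forcing max_model_len here.
        return DEFAULT_MAX_NUM_BATCHED_TOKENS
    # Without chunked prefill, a whole prompt must fit in one batch, so a
    # too-short max_model_len is raised to the default (inferred from the
    # surviving comment in the hunk above).
    return max(max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)

# Example: with chunked prefill, the batch cap stays at the default
# even for long-context models; without it, max_model_len wins.
assert default_max_num_batched_tokens(True, 32768) == 2048
assert default_max_num_batched_tokens(False, 32768) == 32768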