diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 09acde6e08fa..fe4153e60997 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -641,6 +641,34 @@ def test_schedule_concurrent_batches( scheduler.update_from_output(scheduler_output1, model_runner_output) +@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) +def test_schedule_order(enable_chunked_prefill: bool): + scheduler = create_scheduler( + max_num_batched_tokens=1024, + max_num_seqs=3, + enable_chunked_prefill=enable_chunked_prefill, + ) + + # long requests + requests = create_requests(num_requests=2, num_tokens=800) + # short requests + requests += create_requests(num_requests=2, num_tokens=10) + + for request in requests: + scheduler.add_request(request) + + scheduler_output1 = scheduler.schedule() + + if enable_chunked_prefill: + # When chunked prefill is enabled, long requests will be chunked. + assert len(scheduler_output1.scheduled_new_reqs) == 2 + else: + # When chunked prefill is disabled, the scheduler should not skip + # the long requests and schedule subsequent short requests in advance, + # even though there is still token budget remaining. + assert len(scheduler_output1.scheduled_new_reqs) == 1 + + def test_preempt_during_execution(): # NOTE(woosuk): The actual number of available blocks is 10 instead of 11 # because block 0 is reserved as the null block. 
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 6830f6873645..7537c7a60476 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -42,6 +42,7 @@ def create_scheduler( model: str = "facebook/opt-125m", max_num_seqs: int = 16, max_num_batched_tokens: int = 8192, + enable_chunked_prefill: bool = True, enable_prefix_caching: bool = False, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, @@ -76,7 +77,7 @@ def create_scheduler( max_model_len=max_model_len, long_prefill_token_threshold=long_prefill_token_threshold, disable_chunked_mm_input=disable_chunked_mm_input, - enable_chunked_prefill=True, + enable_chunked_prefill=enable_chunked_prefill, async_scheduling=async_scheduling, ) model_config = ModelConfig( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a7ec0de37263..23af014c1036 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface): not self.scheduler_config.enable_chunked_prefill and num_new_tokens > token_budget ): - self.waiting.pop_request() - skipped_waiting_requests.prepend_request(request) - continue + # If chunked prefill is disabled, the request must fit in the budget, + # so stop scheduling here instead of skipping it. + break num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0