[Bugfix] If chunked_prefill is disabled, end the scheduling early. (#28911)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
parent 6330f9477d
commit 67fc16cd8c
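As the scheduler hunk below shows, a waiting request whose remaining prefill exceeded the current token budget was previously popped from the waiting queue, moved to skipped_waiting_requests, and the loop continued, so later (shorter) requests could be scheduled ahead of it. When chunked prefill is disabled the budget can only shrink for the rest of the step, so this commit breaks out of the loop instead: the long request stays at the head of the queue and the scheduling step ends early, preserving first-come-first-served order.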
@@ -641,6 +641,34 @@ def test_schedule_concurrent_batches(
     scheduler.update_from_output(scheduler_output1, model_runner_output)
 
 
+@pytest.mark.parametrize("enable_chunked_prefill", [True, False])
+def test_schedule_order(enable_chunked_prefill: bool):
+    scheduler = create_scheduler(
+        max_num_batched_tokens=1024,
+        max_num_seqs=3,
+        enable_chunked_prefill=enable_chunked_prefill,
+    )
+
+    # long requests
+    requests = create_requests(num_requests=2, num_tokens=800)
+    # short requests
+    requests += create_requests(num_requests=2, num_tokens=10)
+
+    for request in requests:
+        scheduler.add_request(request)
+
+    scheduler_output1 = scheduler.schedule()
+
+    if enable_chunked_prefill:
+        # When chunked prefill is enabled, the long requests are chunked.
+        assert len(scheduler_output1.scheduled_new_reqs) == 2
+    else:
+        # When chunked prefill is disabled, the scheduler must not skip the
+        # long request and schedule the subsequent short requests ahead of
+        # it, even though there is still token budget remaining.
+        assert len(scheduler_output1.scheduled_new_reqs) == 1
+
+
 def test_preempt_during_execution():
     # NOTE(woosuk): The actual number of available blocks is 10 instead of 11
     # because block 0 is reserved as the null block.
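The expected counts follow from the token-budget arithmetic: with a 1024-token budget, the first 800-token prefill always fits, leaving 224 tokens for the second. A minimal sketch of that arithmetic, written as standalone Python rather than vLLM code and assuming each request's prefill is exactly its num_tokens:

BUDGET = 1024
prefills = [800, 800, 10, 10]  # two long requests, then two short ones

def count_scheduled(prefills, budget, chunked_prefill):
    scheduled = 0
    for num_tokens in prefills:
        if num_tokens > budget:
            if not chunked_prefill:
                # With the fix: end the step instead of skipping ahead.
                break
            num_tokens = budget  # chunked prefill: schedule a partial chunk
        budget -= num_tokens
        scheduled += 1
        if budget == 0:
            break
    return scheduled

assert count_scheduled(prefills, BUDGET, chunked_prefill=True) == 2   # 800, then a 224-token chunk
assert count_scheduled(prefills, BUDGET, chunked_prefill=False) == 1  # 800; the next 800 ends the step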
@@ -42,6 +42,7 @@ def create_scheduler(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
+    enable_chunked_prefill: bool = True,
     enable_prefix_caching: bool = False,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
@@ -76,7 +77,7 @@ def create_scheduler(
         max_model_len=max_model_len,
         long_prefill_token_threshold=long_prefill_token_threshold,
         disable_chunked_mm_input=disable_chunked_mm_input,
-        enable_chunked_prefill=True,
+        enable_chunked_prefill=enable_chunked_prefill,
         async_scheduling=async_scheduling,
     )
     model_config = ModelConfig(
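The two hunks above thread the new enable_chunked_prefill flag through the create_scheduler test helper, which previously hard-coded enable_chunked_prefill=True, so the test can exercise both settings.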
@@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface):
                 not self.scheduler_config.enable_chunked_prefill
                 and num_new_tokens > token_budget
             ):
-                self.waiting.pop_request()
-                skipped_waiting_requests.prepend_request(request)
-                continue
+                # If chunked_prefill is disabled,
+                # we can stop scheduling here.
+                break
 
             num_new_tokens = min(num_new_tokens, token_budget)
             assert num_new_tokens > 0
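To see why break rather than continue matters here, consider a toy model of the waiting-queue loop. This is a hypothetical simplification, with plain token counts standing in for requests, not the real Scheduler:

from collections import deque

def schedule_step(waiting: deque, token_budget: int, end_early: bool) -> list:
    """Schedule one step; returns the prefill sizes scheduled, in order."""
    scheduled, skipped = [], deque()
    while waiting and token_budget > 0:
        num_new_tokens = waiting[0]
        if num_new_tokens > token_budget:
            if end_early:
                # New behavior: end the step; the long request keeps its
                # place at the head of the queue.
                break
            # Old behavior: set the request aside and keep scanning, so
            # later short requests jump ahead of it.
            skipped.append(waiting.popleft())
            continue
        scheduled.append(waiting.popleft())
        token_budget -= num_new_tokens
    waiting.extendleft(reversed(skipped))  # restore skipped requests
    return scheduled

print(schedule_step(deque([800, 800, 10, 10]), 1024, end_early=False))  # [800, 10, 10]
print(schedule_step(deque([800, 800, 10, 10]), 1024, end_early=True))   # [800]

With the old policy the two 10-token requests are scheduled ahead of the second 800-token request; with the fix the step simply ends after the first request, matching the test's expectation of a single scheduled new request.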