From 8c7bab79f5f7747f37db00c9d018614c17de30dd Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Mon, 3 Jun 2024 03:36:38 +0000
Subject: [PATCH] simplify code

---
 vllm/core/scheduler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 7c70b1b244f7d..9b578872c3869 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -386,7 +386,7 @@ class Scheduler:
                 chunked number of tokens are scheduled if
                 `budget.num_batched_tokens` has not enough capacity to schedule
                 all tokens.
-    
+
         Returns:
             A tuple of remaining running queue (should be always 0) after
             scheduling and SchedulerRunningOutputs.
@@ -651,11 +651,12 @@ class Scheduler:
             assert len(waiting_seqs) == 1, (
                 "Waiting sequence group should have only one prompt "
                 "sequence.")
+            waiting_seq = waiting_seqs[0]
             num_new_tokens = self._get_num_new_tokens(seq_group,
                                                       SequenceStatus.WAITING,
                                                       enable_chunking, budget)
             if not enable_chunking:
-                num_prompt_tokens = waiting_seqs[0].get_len()
+                num_prompt_tokens = waiting_seq.get_len()
                 assert num_new_tokens == num_prompt_tokens

             prompt_limit = self._get_prompt_limit(seq_group)
@@ -663,8 +664,7 @@ class Scheduler:
                 logger.warning(
                     "Input prompt (%d tokens) is too long"
                     " and exceeds limit of %d", num_new_tokens, prompt_limit)
-                for seq in waiting_seqs:
-                    seq.status = SequenceStatus.FINISHED_IGNORED
+                waiting_seq.status = SequenceStatus.FINISHED_IGNORED
                 ignored_seq_groups.append(seq_group)
                 waiting_queue.popleft()
                 continue
@@ -727,7 +727,7 @@ class Scheduler:

     def _schedule_default(self) -> SchedulerOutputs:
         """Schedule queued requests.
-        
+
         The current policy is designed to optimize the throughput. First,
         it batches as many prefill requests as possible. And it schedules
         decodes. If there's a pressure on GPU memory, decode requests can
@@ -821,7 +821,7 @@ class Scheduler:

     def _schedule_chunked_prefill(self):
         """Schedule queued requests.
-        
+
         Chunked prefill allows to chunk prefill requests, batch them together
         with decode requests. This policy 1. schedule as many decoding
         requests as possible. 2. schedule chunked prefill requests that are not
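
The simplification in the second and third hunks hinges on the invariant
asserted just above them: a waiting sequence group carries exactly one prompt
sequence, so the `for seq in waiting_seqs` loop collapses to a single
`waiting_seq` binding. Below is a minimal, runnable Python sketch of that same
pattern; `Sequence`, `SequenceStatus`, and `ignore_if_too_long` are toy
stand-ins for illustration, not vLLM's actual classes or API.

    from enum import Enum, auto
    from typing import List


    class SequenceStatus(Enum):
        WAITING = auto()
        FINISHED_IGNORED = auto()


    class Sequence:
        def __init__(self, prompt_token_ids: List[int]):
            self.prompt_token_ids = prompt_token_ids
            self.status = SequenceStatus.WAITING

        def get_len(self) -> int:
            return len(self.prompt_token_ids)


    def ignore_if_too_long(waiting_seqs: List[Sequence],
                           prompt_limit: int) -> bool:
        """Mark an over-long prompt as ignored; returns True if ignored."""
        # The asserted invariant makes the old loop unnecessary: bind the
        # single element once and reuse the local.
        assert len(waiting_seqs) == 1, (
            "Waiting sequence group should have only one prompt sequence.")
        waiting_seq = waiting_seqs[0]
        if waiting_seq.get_len() > prompt_limit:
            waiting_seq.status = SequenceStatus.FINISHED_IGNORED
            return True
        return False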
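The `_schedule_default` docstring touched by the fourth hunk describes a
throughput-first ordering: batch as many prefills as the token budget allows,
then schedule decodes. A toy sketch of that ordering follows, under the
assumption that each decode step consumes one token of budget; the deque of
dict-shaped requests is illustrative and not vLLM's scheduler API.

    from collections import deque


    def schedule_default(waiting: deque, running: deque,
                         token_budget: int) -> list:
        """One scheduling step: prefills first, then decodes."""
        scheduled = []
        # 1. Batch as many prefill (waiting) requests as the budget allows.
        while waiting and waiting[0]["num_prompt_tokens"] <= token_budget:
            req = waiting.popleft()
            token_budget -= req["num_prompt_tokens"]
            scheduled.append(req)
        # 2. Spend the remaining budget on decodes, one token apiece.
        while running and token_budget > 0:
            scheduled.append(running.popleft())
            token_budget -= 1
        return scheduled


    # Example: a 10-token budget fits both prefills (4 + 3), leaving
    # room for three decodes.
    waiting = deque([{"id": "p0", "num_prompt_tokens": 4},
                     {"id": "p1", "num_prompt_tokens": 3}])
    running = deque([{"id": f"d{i}"} for i in range(5)])
    print([r["id"] for r in schedule_default(waiting, running, 10)])
    # -> ['p0', 'p1', 'd0', 'd1', 'd2']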