simplify code

Zhuohan Li 2024-06-03 03:36:38 +00:00
parent 1936d7bab0
commit 8c7bab79f5


@@ -386,7 +386,7 @@ class Scheduler:
                 chunked number of tokens are scheduled if
                 `budget.num_batched_tokens` has not enough capacity to schedule
                 all tokens.
 
         Returns:
             A tuple of remaining running queue (should be always 0) after
             scheduling and SchedulerRunningOutputs.
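Aside on this hunk's context: the docstring describes chunked scheduling, where a request's new tokens are capped by whatever is left of the batched-token budget. Below is a minimal runnable sketch of that capping rule, using hypothetical TokenBudget / schedule_tokens names rather than vLLM's actual classes:

from dataclasses import dataclass

@dataclass
class TokenBudget:
    # Hypothetical stand-in for the scheduler's budget object.
    max_num_batched_tokens: int
    num_batched_tokens: int = 0

    @property
    def remaining(self) -> int:
        return self.max_num_batched_tokens - self.num_batched_tokens

def schedule_tokens(budget: TokenBudget, num_new_tokens: int,
                    enable_chunking: bool) -> int:
    # With chunking, schedule at most the remaining budget; without it,
    # the request must fit entirely or nothing is scheduled.
    if enable_chunking:
        num_new_tokens = min(num_new_tokens, budget.remaining)
    elif num_new_tokens > budget.remaining:
        return 0
    budget.num_batched_tokens += num_new_tokens
    return num_new_tokens

# E.g. with 64 tokens of budget left, a 256-token prefill is chunked to 64.
budget = TokenBudget(max_num_batched_tokens=512, num_batched_tokens=448)
assert schedule_tokens(budget, 256, enable_chunking=True) == 64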
@@ -651,11 +651,12 @@ class Scheduler:
             assert len(waiting_seqs) == 1, (
                 "Waiting sequence group should have only one prompt "
                 "sequence.")
+            waiting_seq = waiting_seqs[0]
             num_new_tokens = self._get_num_new_tokens(seq_group,
                                                       SequenceStatus.WAITING,
                                                       enable_chunking, budget)
             if not enable_chunking:
-                num_prompt_tokens = waiting_seqs[0].get_len()
+                num_prompt_tokens = waiting_seq.get_len()
                 assert num_new_tokens == num_prompt_tokens
 
             prompt_limit = self._get_prompt_limit(seq_group)
@@ -663,8 +664,7 @@ class Scheduler:
                 logger.warning(
                     "Input prompt (%d tokens) is too long"
                     " and exceeds limit of %d", num_new_tokens, prompt_limit)
-                for seq in waiting_seqs:
-                    seq.status = SequenceStatus.FINISHED_IGNORED
+                waiting_seq.status = SequenceStatus.FINISHED_IGNORED
                 ignored_seq_groups.append(seq_group)
                 waiting_queue.popleft()
                 continue
@@ -727,7 +727,7 @@ class Scheduler:
     def _schedule_default(self) -> SchedulerOutputs:
         """Schedule queued requests.
 
         The current policy is designed to optimize the throughput. First,
         it batches as many prefill requests as possible. And it schedules
         decodes. If there's a pressure on GPU memory, decode requests can
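The docstring above summarizes the default policy: prefills are batched first, and decodes run otherwise, with swapping or preemption relieving GPU memory pressure. A toy sketch of that ordering follows; the request dicts and the decodes-only-when-no-prefill rule are simplifying assumptions, not vLLM's actual types or full logic, and preemption is omitted:

from collections import deque

def schedule_default(waiting: deque, running: deque,
                     token_budget: int) -> list:
    scheduled = []
    # 1. Batch as many whole prefill requests as fit in the budget.
    while waiting and waiting[0]["prompt_len"] <= token_budget:
        req = waiting.popleft()
        token_budget -= req["prompt_len"]
        scheduled.append(("prefill", req))
    # 2. If no prefill was scheduled, run decodes instead
    #    (one new token per running request).
    if not scheduled:
        while running and token_budget > 0:
            scheduled.append(("decode", running.popleft()))
            token_budget -= 1
    return scheduled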
@@ -821,7 +821,7 @@ class Scheduler:
     def _schedule_chunked_prefill(self):
         """Schedule queued requests.
 
         Chunked prefill allows to chunk prefill requests, batch them together
         with decode requests. This policy 1. schedule as many decoding requests
         as possible. 2. schedule chunked prefill requests that are not
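The chunked-prefill policy inverts the default ordering: decodes are scheduled first, and prefills are chunked into whatever budget remains so both kinds land in one batch. A toy sketch under the same hypothetical request representation as above:

from collections import deque

def schedule_chunked_prefill(waiting: deque, running: deque,
                             token_budget: int) -> list:
    scheduled = []
    # 1. Schedule as many decoding requests as possible (1 token each).
    while running and token_budget > 0:
        scheduled.append(("decode", running.popleft()))
        token_budget -= 1
    # 2. Chunk prefill requests into the remaining budget so they are
    #    batched together with the decodes above.
    while waiting and token_budget > 0:
        req = waiting[0]
        chunk = min(req["remaining_prompt"], token_budget)
        req["remaining_prompt"] -= chunk
        token_budget -= chunk
        scheduled.append(("prefill", req, chunk))
        if req["remaining_prompt"] == 0:
            waiting.popleft()  # prompt fully prefilled
        else:
            break  # budget exhausted mid-prompt; resume next step
    return scheduled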