simplify code
commit 8c7bab79f5 (parent 1936d7bab0)
@@ -386,7 +386,7 @@ class Scheduler:
                 chunked number of tokens are scheduled if
                 `budget.num_batched_tokens` has not enough capacity to schedule
                 all tokens.

         Returns:
             A tuple of remaining running queue (should be always 0) after
             scheduling and SchedulerRunningOutputs.
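As background for this docstring: with chunking enabled, a prefill whose tokens exceed the step's remaining `budget.num_batched_tokens` is scheduled only up to the leftover capacity rather than skipped. A minimal standalone sketch of that budgeting idea (toy names, not vLLM's actual `SchedulingBudget` API):

```python
# Toy illustration of budget-limited chunking; names are illustrative,
# not vLLM's actual API.

class TokenBudget:
    """Tracks how many tokens may still be batched in this step."""

    def __init__(self, token_limit: int) -> None:
        self.token_limit = token_limit
        self.num_batched_tokens = 0

    def remaining(self) -> int:
        return self.token_limit - self.num_batched_tokens

    def add(self, num_tokens: int) -> None:
        self.num_batched_tokens += num_tokens


def tokens_to_schedule(num_new_tokens: int, budget: TokenBudget,
                       enable_chunking: bool) -> int:
    """Return how many of num_new_tokens fit this step.

    With chunking enabled, a prefill that exceeds the remaining budget
    is truncated to the leftover capacity instead of being skipped.
    """
    if enable_chunking:
        return min(num_new_tokens, budget.remaining())
    # Without chunking, scheduling is all-or-nothing.
    return num_new_tokens if num_new_tokens <= budget.remaining() else 0


budget = TokenBudget(token_limit=512)
budget.add(400)                       # tokens already claimed this step
print(tokens_to_schedule(300, budget, enable_chunking=True))   # 112
print(tokens_to_schedule(300, budget, enable_chunking=False))  # 0
```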
@@ -651,11 +651,12 @@ class Scheduler:
             assert len(waiting_seqs) == 1, (
                 "Waiting sequence group should have only one prompt "
                 "sequence.")
+            waiting_seq = waiting_seqs[0]
             num_new_tokens = self._get_num_new_tokens(seq_group,
                                                       SequenceStatus.WAITING,
                                                       enable_chunking, budget)
             if not enable_chunking:
-                num_prompt_tokens = waiting_seqs[0].get_len()
+                num_prompt_tokens = waiting_seq.get_len()
                 assert num_new_tokens == num_prompt_tokens

             prompt_limit = self._get_prompt_limit(seq_group)
@@ -663,8 +664,7 @@ class Scheduler:
                 logger.warning(
                     "Input prompt (%d tokens) is too long"
                     " and exceeds limit of %d", num_new_tokens, prompt_limit)
-                for seq in waiting_seqs:
-                    seq.status = SequenceStatus.FINISHED_IGNORED
+                waiting_seq.status = SequenceStatus.FINISHED_IGNORED
                 ignored_seq_groups.append(seq_group)
                 waiting_queue.popleft()
                 continue
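The two hunks above lean on the invariant asserted earlier: a waiting sequence group holds exactly one prompt sequence, so the `for seq in waiting_seqs:` loop collapses to a single assignment on the hoisted `waiting_seq` alias. A standalone sketch of the resulting over-limit ignore path (toy types, not vLLM's own classes):

```python
from collections import deque
from enum import Enum, auto


class SequenceStatus(Enum):
    WAITING = auto()
    FINISHED_IGNORED = auto()


class Sequence:
    def __init__(self, prompt_len: int) -> None:
        self.status = SequenceStatus.WAITING
        self.prompt_len = prompt_len

    def get_len(self) -> int:
        return self.prompt_len


def drain_over_limit(waiting_queue: deque, prompt_limit: int) -> list:
    """Pop waiting groups whose prompt exceeds the limit, marking them ignored."""
    ignored = []
    while waiting_queue:
        waiting_seqs = waiting_queue[0]
        assert len(waiting_seqs) == 1, (
            "Waiting sequence group should have only one prompt sequence.")
        waiting_seq = waiting_seqs[0]  # the hoisted alias from this commit
        if waiting_seq.get_len() <= prompt_limit:
            break  # in-limit requests are handled by normal scheduling
        # No loop needed: there is exactly one sequence to mark.
        waiting_seq.status = SequenceStatus.FINISHED_IGNORED
        ignored.append(waiting_queue.popleft())
    return ignored


queue = deque([[Sequence(5000)], [Sequence(100)]])
ignored = drain_over_limit(queue, prompt_limit=4096)
print(len(ignored), len(queue))  # 1 1
```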
@@ -727,7 +727,7 @@ class Scheduler:

     def _schedule_default(self) -> SchedulerOutputs:
         """Schedule queued requests.

         The current policy is designed to optimize the throughput. First,
         it batches as many prefill requests as possible. And it schedules
         decodes. If there's a pressure on GPU memory, decode requests can
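A minimal sketch of the prefill-first ordering this docstring describes (toy request dicts and return values, not the real `SchedulerOutputs`): prefills are drained from the waiting queue while the token budget allows, and decodes run only when no prefill was scheduled.

```python
from collections import deque


def schedule_default(waiting: deque, running: deque, token_budget: int):
    """Toy version of a prefill-prioritized step.

    Returns (prefills, decodes): if any prefill fits the budget, the step
    batches only prefills; otherwise every running request decodes one token.
    """
    prefills, used = [], 0
    while waiting and used + waiting[0]["prompt_len"] <= token_budget:
        req = waiting.popleft()
        used += req["prompt_len"]
        prefills.append(req)
    if prefills:
        return prefills, []      # prefill-only batch, maximizing throughput
    return [], list(running)     # no prefill fit: run decodes instead


waiting = deque([{"id": 0, "prompt_len": 300}, {"id": 1, "prompt_len": 300}])
running = deque()
prefills, decodes = schedule_default(waiting, running, token_budget=512)
print([r["id"] for r in prefills])  # [0]  (the second prompt exceeds the budget)
```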
@@ -821,7 +821,7 @@ class Scheduler:

     def _schedule_chunked_prefill(self):
         """Schedule queued requests.

         Chunked prefill allows to chunk prefill requests, batch them together
         with decode requests. This policy 1. schedule as many decoding requests
         as possible. 2. schedule chunked prefill requests that are not
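A sketch of the decode-first ordering described here (toy structures under assumed names, not vLLM's implementation): decodes claim one token each first, then waiting prefills consume the leftover budget, with the last prefill chunked if needed.

```python
from collections import deque


def schedule_chunked_prefill(waiting: deque, running: list, token_budget: int):
    """Toy decode-first step with chunked prefills.

    Decodes are scheduled first (one token each); waiting prefills then
    fill the remaining budget, chunking the final one to fit.
    """
    batch = [(req["id"], "decode", 1) for req in running]
    remaining = token_budget - len(running)
    while waiting and remaining > 0:
        req = waiting[0]
        chunk = min(req["prompt_left"], remaining)
        batch.append((req["id"], "prefill", chunk))
        remaining -= chunk
        req["prompt_left"] -= chunk
        if req["prompt_left"] == 0:
            waiting.popleft()  # prefill finished; it decodes in later steps
    return batch


running = [{"id": 0}, {"id": 1}]
waiting = deque([{"id": 2, "prompt_left": 700}])
print(schedule_chunked_prefill(waiting, running, token_budget=256))
# [(0, 'decode', 1), (1, 'decode', 1), (2, 'prefill', 254)]
```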