mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-03 07:11:21 +08:00
[Core] LoRA: V1 Scheduler optimization (#15422)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
parent
ac3cd6e83c
commit
a5cfbab3c8
@ -239,16 +239,16 @@ class Scheduler(SchedulerInterface):
|
|||||||
encoder_budget = new_encoder_budget
|
encoder_budget = new_encoder_budget
|
||||||
|
|
||||||
# Record the LoRAs in scheduled_running_reqs
|
# Record the LoRAs in scheduled_running_reqs
|
||||||
requested_loras: set[int] = set()
|
scheduled_loras: set[int] = set()
|
||||||
if self.lora_config:
|
if self.lora_config:
|
||||||
requested_loras = set(
|
scheduled_loras = set(
|
||||||
req.lora_request.lora_int_id for req in scheduled_running_reqs
|
req.lora_request.lora_int_id for req in scheduled_running_reqs
|
||||||
if req.lora_request and req.lora_request.lora_int_id > 0)
|
if req.lora_request and req.lora_request.lora_int_id > 0)
|
||||||
assert len(requested_loras) <= self.lora_config.max_loras
|
assert len(scheduled_loras) <= self.lora_config.max_loras
|
||||||
|
|
||||||
# Use a temporary deque to collect requests that need to be skipped
|
# Use a temporary deque to collect requests that need to be skipped
|
||||||
# and put back at the head of the waiting queue later
|
# and put back at the head of the waiting queue later
|
||||||
waiting_for_fsm: deque[Request] = deque()
|
skipped_waiting_requests: deque[Request] = deque()
|
||||||
|
|
||||||
# Next, schedule the WAITING requests.
|
# Next, schedule the WAITING requests.
|
||||||
if not preempted_reqs:
|
if not preempted_reqs:
|
||||||
@ -258,31 +258,30 @@ class Scheduler(SchedulerInterface):
|
|||||||
|
|
||||||
request = self.waiting[0]
|
request = self.waiting[0]
|
||||||
|
|
||||||
if request.status == RequestStatus.WAITING_FOR_FSM:
|
# Waiting request skipping logic
|
||||||
|
is_skipped = False
|
||||||
|
# Skip request if the structured output request is still waiting
|
||||||
|
# for FSM.
|
||||||
|
if (not is_skipped
|
||||||
|
and request.status == RequestStatus.WAITING_FOR_FSM):
|
||||||
structured_output_req = request.structured_output_request
|
structured_output_req = request.structured_output_request
|
||||||
if structured_output_req and structured_output_req.grammar:
|
is_skipped = (not structured_output_req
|
||||||
|
or not structured_output_req.grammar)
|
||||||
|
if not is_skipped:
|
||||||
request.status = RequestStatus.WAITING
|
request.status = RequestStatus.WAITING
|
||||||
else:
|
|
||||||
waiting_structured_output_req = self.waiting.popleft()
|
|
||||||
waiting_for_fsm.appendleft(
|
|
||||||
waiting_structured_output_req)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check that adding the request still respects the max_loras
|
# Skip request if max_loras can't be honored.
|
||||||
# constraint.
|
if (not is_skipped and self.lora_config
|
||||||
if self.lora_config and request.lora_request:
|
and request.lora_request):
|
||||||
req_lora_id = request.lora_request.lora_int_id
|
req_lora_id = request.lora_request.lora_int_id
|
||||||
if len(requested_loras) == self.lora_config.max_loras and (
|
is_skipped = (len(scheduled_loras)
|
||||||
req_lora_id not in requested_loras):
|
== self.lora_config.max_loras
|
||||||
# Cannot schedule.
|
and (req_lora_id not in scheduled_loras))
|
||||||
# TODO (varun): This means all the other requests in
|
|
||||||
# the WAITING queue will be blocked by this request,
|
if is_skipped:
|
||||||
# even if,
|
skipped_waiting_requests.appendleft(request)
|
||||||
# 1. these other requests do not use LoRA, or,
|
self.waiting.popleft()
|
||||||
# 2. these other requests use the already requested
|
continue
|
||||||
# LoRAs.
|
|
||||||
# This is too conservative and could be optimized.
|
|
||||||
break
|
|
||||||
|
|
||||||
# Get already-cached tokens.
|
# Get already-cached tokens.
|
||||||
computed_blocks, num_computed_tokens = \
|
computed_blocks, num_computed_tokens = \
|
||||||
@ -344,7 +343,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
f"Invalid request status: {request.status}")
|
f"Invalid request status: {request.status}")
|
||||||
|
|
||||||
if self.lora_config and request.lora_request:
|
if self.lora_config and request.lora_request:
|
||||||
requested_loras.add(request.lora_request.lora_int_id)
|
scheduled_loras.add(request.lora_request.lora_int_id)
|
||||||
req_to_new_block_ids[request.request_id] = [
|
req_to_new_block_ids[request.request_id] = [
|
||||||
b.block_id for b in computed_blocks + new_blocks
|
b.block_id for b in computed_blocks + new_blocks
|
||||||
]
|
]
|
||||||
@ -363,8 +362,8 @@ class Scheduler(SchedulerInterface):
|
|||||||
encoder_budget = new_encoder_budget
|
encoder_budget = new_encoder_budget
|
||||||
|
|
||||||
# Put back any skipped requests at the head of the waiting queue
|
# Put back any skipped requests at the head of the waiting queue
|
||||||
if waiting_for_fsm:
|
if skipped_waiting_requests:
|
||||||
self.waiting.extendleft(waiting_for_fsm)
|
self.waiting.extendleft(skipped_waiting_requests)
|
||||||
|
|
||||||
# Check if the scheduling constraints are satisfied.
|
# Check if the scheduling constraints are satisfied.
|
||||||
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
|
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user