[V1][PP] Run engine busy loop with batch queue (#13064)

Cody Yu 2025-02-15 03:59:01 -08:00 committed by GitHub
parent ed0de3e4b8
commit 9206b3d7ec
6 changed files with 299 additions and 15 deletions

View File

@@ -213,3 +213,54 @@ def test_schedule_partial_requests():
    assert output.num_scheduled_tokens[requests[0].request_id] == 1
    assert output.num_scheduled_tokens[requests[1].request_id] == 700
    assert requests[2].request_id not in output.num_scheduled_tokens


def test_schedule_concurrent_batches():
    scheduler = create_scheduler(
        max_num_batched_tokens=1024,
        max_num_seqs=2,
    )
    requests = create_requests(
        num_requests=2,
        num_tokens=512,
    )

    # Schedule the first request.
    scheduler.add_request(requests[0])
    scheduler_output0 = scheduler.schedule()
    assert len(scheduler_output0.scheduled_new_reqs) == 1
    assert scheduler_output0.num_scheduled_tokens[
        requests[0].request_id] == 512

    # The first request is still running, so only schedule the second request.
    scheduler.add_request(requests[1])
    scheduler_output1 = scheduler.schedule()
    assert len(scheduler_output1.scheduled_new_reqs) == 1
    assert scheduler_output1.num_scheduled_tokens[
        requests[1].request_id] == 512

    # Model output of the first request.
    model_runner_output = ModelRunnerOutput(
        req_ids=[requests[0].request_id],
        req_id_to_index={requests[0].request_id: 0},
        sampled_token_ids=[0],
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(scheduler_output0, model_runner_output)

    # Schedule the next step.
    # The first request can be scheduled again while the second
    # request is still running.
    scheduler_output2 = scheduler.schedule()
    assert scheduler_output2.num_scheduled_tokens[requests[0].request_id] == 1

    # Model output of the second request.
    model_runner_output = ModelRunnerOutput(
        req_ids=[requests[1].request_id],
        req_id_to_index={requests[1].request_id: 0},
        sampled_token_ids=[0],
        logprobs=None,
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(scheduler_output1, model_runner_output)
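
The behaviour this test depends on can also be summarized with the new get_num_unscheduled_requests() accounting. A minimal sketch, assuming the same create_scheduler/create_requests helpers used in this test file (illustrative only, not part of the commit):

scheduler = create_scheduler(max_num_batched_tokens=1024, max_num_seqs=2)
requests = create_requests(num_requests=2, num_tokens=512)

scheduler.add_request(requests[0])
assert scheduler.get_num_unscheduled_requests() == 1  # added, not yet executing

scheduler.schedule()  # request 0 is handed to the executor
assert scheduler.get_num_unscheduled_requests() == 0

scheduler.add_request(requests[1])
scheduler.schedule()  # request 1 is scheduled; in-flight request 0 is skipped
assert scheduler.get_num_unscheduled_requests() == 0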

View File

@@ -1,7 +1,10 @@
# SPDX-License-Identifier: Apache-2.0

import copy
import threading
import time
import uuid
from concurrent.futures import Future

import pytest
from transformers import AutoTokenizer

@@ -12,7 +15,9 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.executor.abstract import Executor, UniProcExecutor
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.outputs import ModelRunnerOutput

if not current_platform.is_cuda():
    pytest.skip(reason="V1 currently only supported on CUDA.",

@@ -191,3 +196,85 @@ def test_engine_core_advanced_sampling(monkeypatch):
    )
    engine_core.add_request(request2)
    _check_engine_state()

@fork_new_process_for_each_test
def test_engine_core_concurrent_batches(monkeypatch):
    """
    Test that the engine can handle multiple concurrent batches.
    """

    def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest:
        request = make_request()
        request.sampling_params.max_tokens = max_tokens
        return request

    class DummyExecutor(UniProcExecutor):

        def initialize(self, kv_cache_config: KVCacheConfig) -> None:
            super().initialize(kv_cache_config)

            # This executor can actually only run 1 batch at a time.
            self.semaphore = threading.Semaphore(1)

        def execute_model(
            self,
            scheduler_output,
        ) -> Future[ModelRunnerOutput]:
            """Make execute_model non-blocking."""
            future: Future[ModelRunnerOutput] = Future()

            def _thread_wrapper(scheduler_output, future):
                with self.semaphore:
                    output = self.collective_rpc("execute_model",
                                                 args=(scheduler_output, ))
                    # Make a copy because output[0] may be reused
                    # by the next batch.
                    output = copy.deepcopy(output[0])
                    future.set_result(output)

            threading.Thread(target=_thread_wrapper,
                             args=(scheduler_output, future)).start()
            return future

        @property
        def max_concurrent_batches(self) -> int:
            return 2

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        engine_args = EngineArgs(
            model=MODEL_NAME,
            # To test concurrent batches.
            max_num_seqs=2,
            # Avoid all requests being scheduled at once.
            enable_prefix_caching=False,
            max_num_batched_tokens=10,
        )
        vllm_config = engine_args.create_engine_config()
        engine_core = EngineCore(vllm_config=vllm_config,
                                 log_stats=False,
                                 executor_class=DummyExecutor)
        assert engine_core.batch_queue is not None

        # Add two requests in a row.
        req = make_request_with_max_tokens(5)
        engine_core.add_request(req)
        req = make_request_with_max_tokens(5)
        engine_core.add_request(req)

        # First saturate the batch queue.
        assert engine_core.step_with_batch_queue() is None
        assert engine_core.batch_queue.qsize() == 1
        assert engine_core.step_with_batch_queue() is None
        assert engine_core.batch_queue.qsize() == 2
        assert engine_core.scheduler.get_num_unfinished_requests() == 2

        # Loop through both requests.
        while engine_core.scheduler.get_num_unfinished_requests() == 2:
            engine_core.step_with_batch_queue()

        # We reach here once the first request has finished.
        while engine_core.scheduler.get_num_unfinished_requests() == 1:
            engine_core.step_with_batch_queue()
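
The qsize() assertions above follow from how the engine sizes its queue: EngineCore (see the core changes further down) creates a bounded queue of executor.max_concurrent_batches entries, so the DummyExecutor's value of 2 yields a two-slot queue. A rough sketch of that sizing rule; make_batch_queue is a hypothetical helper, not part of the commit:

import queue
from typing import Optional


def make_batch_queue(executor) -> Optional[queue.Queue]:
    """Hypothetical helper mirroring the sizing rule in EngineCore.__init__."""
    size = executor.max_concurrent_batches  # 2 for the DummyExecutor above
    return queue.Queue(size) if size > 1 else None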

View File

@@ -58,6 +58,9 @@ class Scheduler:
        # Priority queues for requests.
        self.waiting: Deque[Request] = deque()
        self.running: List[Request] = []
        # The requests that have been scheduled and are being executed
        # by the executor.
        self.scheduled_req_ids: Set[str] = set()

        # The request IDs that are finished in between the previous and the
        # current steps. This is used to notify the workers about the finished

@@ -118,6 +121,11 @@
        req_index = 0
        while req_index < len(self.running) and token_budget > 0:
            request = self.running[req_index]
            if request.request_id in self.scheduled_req_ids:
                # This request has already been scheduled.
                req_index += 1
                continue

            num_new_tokens = request.num_tokens - request.num_computed_tokens
            num_new_tokens = min(num_new_tokens, token_budget)
            assert num_new_tokens > 0

@@ -164,6 +172,7 @@
            # Schedule the request.
            scheduled_running_reqs.append(request)
            self.scheduled_req_ids.add(request.request_id)
            req_to_new_block_ids[request.request_id] = [
                b.block_id for b in new_blocks
            ]

@@ -251,6 +260,7 @@
                self.waiting.popleft()
                self.running.append(request)
                self.scheduled_req_ids.add(request.request_id)
                if request.status == RequestStatus.WAITING:
                    scheduled_new_reqs.append(request)
                    self.request_scheduled(request, scheduled_timestamp)

@@ -519,6 +529,7 @@
                        stop_reason=request.stop_reason,
                        events=request.take_events()))

            self.scheduled_req_ids.remove(request.request_id)
            if not stopped:
                new_running.append(request)

@@ -575,6 +586,8 @@
            if request.status == RequestStatus.RUNNING:
                self.running.remove(request)
                if request.request_id in self.scheduled_req_ids:
                    self.scheduled_req_ids.remove(request.request_id)
            else:
                self.waiting.remove(request)
            request.status = finished_status

@@ -595,6 +608,10 @@
    def has_unfinished_requests(self) -> bool:
        return self.get_num_unfinished_requests() > 0

    def get_num_unscheduled_requests(self) -> int:
        """Number of requests that are not being processed by the executor."""
        return self.get_num_unfinished_requests() - len(self.scheduled_req_ids)

    def reset_prefix_cache(self) -> bool:
        return self.kv_cache_manager.reset_prefix_cache()
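
Taken together, these hunks give scheduled_req_ids a simple lifecycle: a request ID is added when its batch is handed to the executor, skipped by later schedule() calls while it is in the set, and removed once update_from_output() (or finish_requests()) processes it. A toy sketch of that bookkeeping, independent of the real Scheduler class and with illustrative names:

from typing import Set


class ScheduledSetSketch:
    """Toy model of the scheduled_req_ids bookkeeping added above."""

    def __init__(self) -> None:
        self.unfinished: Set[str] = set()
        self.scheduled_req_ids: Set[str] = set()

    def add_request(self, req_id: str) -> None:
        self.unfinished.add(req_id)

    def schedule(self, req_id: str) -> None:
        # schedule() skips requests already in scheduled_req_ids, so a
        # request can be in at most one in-flight batch.
        assert req_id not in self.scheduled_req_ids
        self.scheduled_req_ids.add(req_id)

    def update_from_output(self, req_id: str, finished: bool) -> None:
        # The request becomes schedulable again once its output is applied.
        self.scheduled_req_ids.discard(req_id)
        if finished:
            self.unfinished.discard(req_id)

    def get_num_unscheduled_requests(self) -> int:
        return len(self.unfinished) - len(self.scheduled_req_ids)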

View File

@@ -4,8 +4,9 @@ import queue
import signal
import threading
import time
from concurrent.futures import Future
from multiprocessing.connection import Connection
from typing import Any, List, Optional, Tuple, Type

import psutil
import zmq

@@ -18,11 +19,12 @@ from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
from vllm.utils import get_exception_traceback, zmq_socket_ctx
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
from vllm.v1.core.scheduler import Scheduler, SchedulerOutput
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                            EngineCoreRequestType)
from vllm.v1.engine.mm_input_cache import MMInputCacheServer
from vllm.v1.executor.abstract import Executor
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
from vllm.version import __version__ as VLLM_VERSION
@@ -66,9 +68,22 @@ class EngineCore:
            log_stats=self.log_stats,
        )

        # Setup MM Input Mapper.
        self.mm_input_cache_server = MMInputCacheServer(
            vllm_config.model_config)

        # Setup batch queue for pipeline parallelism.
        # Batch queue for scheduled batches. This enables us to asynchronously
        # schedule and execute batches, and is required by pipeline parallelism
        # to eliminate pipeline bubbles.
        self.batch_queue_size = self.model_executor.max_concurrent_batches
        self.batch_queue: Optional[queue.Queue[Tuple[Future[ModelRunnerOutput],
                                                     SchedulerOutput]]] = None
        if self.batch_queue_size > 1:
            logger.info("Batch queue is enabled with size %d",
                        self.batch_queue_size)
            self.batch_queue = queue.Queue(self.batch_queue_size)

    def _initialize_kv_caches(self,
                              vllm_config: VllmConfig) -> Tuple[int, int]:
        start = time.time()
@@ -135,7 +150,55 @@
        scheduler_output = self.scheduler.schedule()
        output = self.model_executor.execute_model(scheduler_output)
        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, output)  # type: ignore
        return engine_core_outputs

    def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]:
        """Schedule and execute batches with the batch queue.
        Note that if there is nothing to output in this step, None is returned.

        The execution flow is as follows:
        1. Try to schedule a new batch if there are unscheduled requests
           and the batch queue is not full. If a new batch is scheduled,
           return None immediately; in other words, model outputs are not
           checked or returned before the batch queue is full.
        2. If no new batch was scheduled, meaning that the batch queue is
           full or no more requests can be scheduled, block until the first
           batch in the queue is finished.
        3. Update the scheduler from that batch's output.
        """
        assert self.batch_queue is not None

        engine_core_outputs = None
        scheduler_output = None
        # If there are unscheduled requests and the batch queue is not full,
        # schedule a new batch. Note that this is not blocking.
        if (self.scheduler.get_num_unscheduled_requests() > 0
                and not self.batch_queue.full()):
            scheduler_output = self.scheduler.schedule()
            if scheduler_output.total_num_scheduled_tokens > 0:
                future = self.model_executor.execute_model(scheduler_output)
                self.batch_queue.put_nowait(
                    (future, scheduler_output))  # type: ignore

        # If all requests are scheduled or the batch queue is full,
        # block until the first batch in the queue is finished.
        if (scheduler_output is None
                or scheduler_output.total_num_scheduled_tokens == 0):
            try:
                future, scheduler_output = self.batch_queue.get(
                    timeout=POLLING_TIMEOUT_S)
                # Block until the first result is available.
                model_output = future.result()

                self.batch_queue.task_done()
                engine_core_outputs = self.scheduler.update_from_output(
                    scheduler_output, model_output)
            except queue.Empty:
                # If the queue is empty (timeout at .get), return an empty
                # EngineCoreOutputs for logging.
                engine_core_outputs = EngineCoreOutputs(
                    outputs=[], scheduler_stats=self.scheduler.make_stats())

        return engine_core_outputs

    def shutdown(self):
@@ -226,6 +289,9 @@ class EngineCoreProc(EngineCore):
    def run_busy_loop(self):
        """Core busy loop of the EngineCore."""

        step_fn = (self.step
                   if self.batch_queue is None else self.step_with_batch_queue)

        # Loop until process is sent a SIGINT or SIGTERM
        while True:
            # 1) Poll the input queue until there is work to do.

@@ -249,9 +315,10 @@
                self._handle_client_request(*req)

            # 3) Step the engine core.
            outputs = step_fn()

            # 4) Put EngineCoreOutputs into the output queue.
            if outputs is not None:
                self.output_queue.put_nowait(outputs)

    def _handle_client_request(self, request_type: EngineCoreRequestType,
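
For orientation, a condensed sketch of the loop body this hunk produces: the busy loop picks step_with_batch_queue() whenever a batch queue exists and forwards only non-None outputs. drive_once() and handle_outputs are hypothetical stand-ins for the surrounding loop and the output queue, not part of the commit:

def drive_once(engine_core, handle_outputs) -> None:
    """Sketch of one pass over the loop body above (input polling omitted)."""
    step_fn = (engine_core.step
               if engine_core.batch_queue is None
               else engine_core.step_with_batch_queue)
    outputs = step_fn()
    # step_with_batch_queue() returns None while it is still filling the
    # batch queue, so only real EngineCoreOutputs reach the output queue.
    if outputs is not None:
        handle_outputs(outputs)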

View File

@@ -1,11 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
from concurrent.futures import Future
from typing import List, Type, Union

from vllm.config import VllmConfig
from vllm.executor.executor_base import ExecutorBase
from vllm.executor.ray_distributed_executor import (  # noqa
    RayDistributedExecutor as RayDistributedExecutorV0)
from vllm.executor.uniproc_executor import (  # noqa
    ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0)
from vllm.executor.uniproc_executor import (  # noqa

@@ -33,6 +32,8 @@ class Executor(ExecutorBase):
                f"ExecutorBase. Got {distributed_executor_backend}.")
            executor_class = distributed_executor_backend
        elif distributed_executor_backend == "ray":
            from vllm.v1.executor.ray_distributed_executor import (  # noqa
                RayDistributedExecutor)
            executor_class = RayDistributedExecutor
        elif distributed_executor_backend == "mp":
            from vllm.v1.executor.multiproc_executor import MultiprocExecutor

@@ -70,11 +71,15 @@
    def execute_model(
        self,
        scheduler_output,
    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
        output = self.collective_rpc("execute_model",
                                     args=(scheduler_output, ))
        return output[0]

    @property
    def max_concurrent_batches(self) -> int:
        return 1

    def profile(self, is_start: bool = True):
        self.collective_rpc("profile", args=(is_start, ))

@@ -85,7 +90,3 @@ class UniProcExecutor(UniProcExecutorV0, Executor):
class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
    pass


class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
    pass
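
Any executor that wants more than one batch in flight follows the same contract the DummyExecutor exercises in the tests above: advertise a larger max_concurrent_batches and return a Future from execute_model(). A hedged sketch of such an override; ThreadedUniProcExecutor is illustrative only and not part of this commit:

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Union

from vllm.v1.executor.abstract import UniProcExecutor
from vllm.v1.outputs import ModelRunnerOutput


class ThreadedUniProcExecutor(UniProcExecutor):
    """Illustrative only: runs the blocking RPC off-thread and returns a Future."""

    _pool = ThreadPoolExecutor(max_workers=1)

    @property
    def max_concurrent_batches(self) -> int:
        return 2

    def execute_model(
        self,
        scheduler_output,
    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:

        def _run() -> ModelRunnerOutput:
            # collective_rpc returns one output per worker; rank 0 is enough here.
            # A real implementation would also copy the output if the model
            # runner reuses its buffers (see the DummyExecutor in the tests).
            return self.collective_rpc("execute_model",
                                       args=(scheduler_output, ))[0]

        return self._pool.submit(_run)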

View File

@@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0

from concurrent.futures import Future
from typing import Union

from vllm.executor.ray_distributed_executor import (  # noqa
    RayDistributedExecutor as RayDistributedExecutorV0)
from vllm.v1.executor.abstract import Executor
from vllm.v1.outputs import ModelRunnerOutput


class FutureWrapper(Future):
    """A wrapper around a Ray output reference to meet the interface
    of .execute_model().
    """

    def __init__(self, ref):
        super().__init__()
        self.ref = ref

    def result(self, timeout=None):
        if timeout is not None:
            raise NotImplementedError("timeout is not supported")
        return self.ref.get()


class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
    """Ray distributed executor using Ray Compiled Graphs."""

    @property
    def max_concurrent_batches(self) -> int:
        """Ray distributed executor supports pipeline parallelism,
        meaning that it allows PP size batches to be executed concurrently.
        """
        return 1  # self.vllm_config.parallel_config.pipeline_parallel_size

    def execute_model(
        self,
        scheduler_output,
    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
        """Execute the model on the Ray workers.

        Args:
            scheduler_output: The scheduler output to execute.

        Returns:
            The model runner output.
        """
        # Build the compiled DAG for the first time.
        if self.forward_dag is None:  # type: ignore
            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)

        refs = self.forward_dag.execute(scheduler_output)  # type: ignore

        # When PP is not used, we block here until the result is available.
        if self.max_concurrent_batches == 1:
            return refs[0].get()

        # When PP is used, we return a FutureWrapper immediately so that
        # the scheduler can yield to the next batch.
        return FutureWrapper(refs[0])
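
On the consuming side, step_with_batch_queue() only ever calls future.result() on whatever execute_model() returned, and FutureWrapper forwards that call to the wrapped Ray reference's .get(). A minimal sketch with a stand-in ref object, so no Ray cluster is needed to see the contract:

class _FakeRef:
    """Stand-in for a Ray object reference; only .get() is required."""

    def __init__(self, value):
        self._value = value

    def get(self):
        return self._value


future = FutureWrapper(_FakeRef("model_runner_output"))
assert future.result() == "model_runner_output"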