[V1] [6/N] API Server: Better Shutdown (#11586)

This commit is contained in:
Robert Shaw 2024-12-30 10:51:02 -05:00 committed by GitHub
parent 8d9b6721e7
commit 5886aa496e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 40 additions and 45 deletions

View File

@ -68,7 +68,7 @@ from vllm.entrypoints.utils import with_cancellation
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
is_valid_ipv6_address, kill_process_tree, set_ulimit) is_valid_ipv6_address, set_ulimit)
from vllm.version import __version__ as VLLM_VERSION from vllm.version import __version__ as VLLM_VERSION
TIMEOUT_KEEP_ALIVE = 5 # seconds TIMEOUT_KEEP_ALIVE = 5 # seconds
@ -133,32 +133,21 @@ async def build_async_engine_client_from_engine_args(
Returns the Client or None if the creation failed. Returns the Client or None if the creation failed.
""" """
# Fall back # AsyncLLMEngine.
# TODO: fill out feature matrix.
if (MQLLMEngineClient.is_unsupported_config(engine_args) if (MQLLMEngineClient.is_unsupported_config(engine_args)
or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
engine_config = engine_args.create_engine_config(
UsageContext.OPENAI_API_SERVER)
uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
"uses_ray", False)
build_engine = partial(AsyncLLMEngine.from_engine_args, engine_client: Optional[EngineClient] = None
engine_args=engine_args, try:
engine_config=engine_config, engine_client = AsyncLLMEngine.from_engine_args(
usage_context=UsageContext.OPENAI_API_SERVER) engine_args=engine_args,
if uses_ray: usage_context=UsageContext.OPENAI_API_SERVER)
# Must run in main thread with ray for its signal handlers to work yield engine_client
engine_client = build_engine() finally:
else: if engine_client and hasattr(engine_client, "shutdown"):
engine_client = await asyncio.get_running_loop().run_in_executor( engine_client.shutdown()
None, build_engine)
yield engine_client # MQLLMEngine.
if hasattr(engine_client, "shutdown"):
engine_client.shutdown()
return
# Otherwise, use the multiprocessing AsyncLLMEngine.
else: else:
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
# Make TemporaryDirectory for prometheus multiprocessing # Make TemporaryDirectory for prometheus multiprocessing
@ -737,15 +726,6 @@ async def run_server(args, **uvicorn_kwargs) -> None:
signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGTERM, signal_handler)
# The child processes will send SIGQUIT to this process when
# any error happens. This process then clean up the whole tree.
# TODO(rob): move this into AsyncLLM.__init__ once we remove
# the context manager below.
def sigquit_handler(signum, frame):
kill_process_tree(os.getpid())
signal.signal(signal.SIGQUIT, sigquit_handler)
async with build_async_engine_client(args) as engine_client: async with build_async_engine_client(args) as engine_client:
app = build_app(args) app = build_app(args)

View File

@ -1,4 +1,6 @@
import asyncio import asyncio
import os
import signal
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
@ -16,6 +18,7 @@ from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import kill_process_tree
from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.detokenizer import Detokenizer
from vllm.v1.engine.processor import Processor from vllm.v1.engine.processor import Processor
@ -38,6 +41,22 @@ class AsyncLLM(EngineClient):
log_requests: bool = True, log_requests: bool = True,
start_engine_loop: bool = True, start_engine_loop: bool = True,
) -> None: ) -> None:
# The child processes will send SIGQUIT when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO: rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
def sigquit_handler(signum, frame):
logger.fatal(
"AsyncLLM got SIGQUIT from worker processes, shutting "
"down. See stack trace above for root cause issue.")
kill_process_tree(os.getpid())
signal.signal(signal.SIGQUIT, sigquit_handler)
assert start_engine_loop assert start_engine_loop
self.log_requests = log_requests self.log_requests = log_requests
@ -276,9 +295,9 @@ class AsyncLLM(EngineClient):
# 4) Abort any requests that finished due to stop strings. # 4) Abort any requests that finished due to stop strings.
await self.engine_core.abort_requests_async(reqs_to_abort) await self.engine_core.abort_requests_async(reqs_to_abort)
except BaseException as e: except Exception as e:
logger.error(e) logger.exception("EngineCore output handler hit an error: %s", e)
raise e kill_process_tree(os.getpid())
async def abort(self, request_id: str) -> None: async def abort(self, request_id: str) -> None:
"""Abort RequestId in self, detokenizer, and engine core.""" """Abort RequestId in self, detokenizer, and engine core."""

View File

@ -6,7 +6,7 @@ import zmq.asyncio
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import get_open_zmq_ipc_path from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
EngineCoreProfile, EngineCoreRequest, EngineCoreProfile, EngineCoreRequest,
EngineCoreRequestType, EngineCoreRequestUnion) EngineCoreRequestType, EngineCoreRequestUnion)
@ -144,17 +144,13 @@ class MPClient(EngineCoreClient):
else: else:
self.ctx = zmq.Context() # type: ignore[attr-defined] self.ctx = zmq.Context() # type: ignore[attr-defined]
# Path for IPC. # Paths and sockets for IPC.
output_path = get_open_zmq_ipc_path() output_path = get_open_zmq_ipc_path()
input_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path()
self.output_socket = make_zmq_socket(self.ctx, output_path,
# Get output (EngineCoreOutput) from EngineCore. zmq.constants.PULL)
self.output_socket = self.ctx.socket(zmq.constants.PULL) self.input_socket = make_zmq_socket(self.ctx, input_path,
self.output_socket.connect(output_path) zmq.constants.PUSH)
# Send input (EngineCoreRequest) to EngineCore.
self.input_socket = self.ctx.socket(zmq.constants.PUSH)
self.input_socket.bind(input_path)
# Start EngineCore in background process. # Start EngineCore in background process.
self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle: Optional[BackgroundProcHandle]