mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-21 00:57:02 +08:00
[V1] [6/N] API Server: Better Shutdown (#11586)
This commit is contained in:
parent
8d9b6721e7
commit
5886aa496e
@ -68,7 +68,7 @@ from vllm.entrypoints.utils import with_cancellation
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
|
from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
|
||||||
is_valid_ipv6_address, kill_process_tree, set_ulimit)
|
is_valid_ipv6_address, set_ulimit)
|
||||||
from vllm.version import __version__ as VLLM_VERSION
|
from vllm.version import __version__ as VLLM_VERSION
|
||||||
|
|
||||||
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
TIMEOUT_KEEP_ALIVE = 5 # seconds
|
||||||
@ -133,32 +133,21 @@ async def build_async_engine_client_from_engine_args(
|
|||||||
Returns the Client or None if the creation failed.
|
Returns the Client or None if the creation failed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Fall back
|
# AsyncLLMEngine.
|
||||||
# TODO: fill out feature matrix.
|
|
||||||
if (MQLLMEngineClient.is_unsupported_config(engine_args)
|
if (MQLLMEngineClient.is_unsupported_config(engine_args)
|
||||||
or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
|
or envs.VLLM_USE_V1 or disable_frontend_multiprocessing):
|
||||||
engine_config = engine_args.create_engine_config(
|
|
||||||
UsageContext.OPENAI_API_SERVER)
|
|
||||||
uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config),
|
|
||||||
"uses_ray", False)
|
|
||||||
|
|
||||||
build_engine = partial(AsyncLLMEngine.from_engine_args,
|
engine_client: Optional[EngineClient] = None
|
||||||
engine_args=engine_args,
|
try:
|
||||||
engine_config=engine_config,
|
engine_client = AsyncLLMEngine.from_engine_args(
|
||||||
usage_context=UsageContext.OPENAI_API_SERVER)
|
engine_args=engine_args,
|
||||||
if uses_ray:
|
usage_context=UsageContext.OPENAI_API_SERVER)
|
||||||
# Must run in main thread with ray for its signal handlers to work
|
yield engine_client
|
||||||
engine_client = build_engine()
|
finally:
|
||||||
else:
|
if engine_client and hasattr(engine_client, "shutdown"):
|
||||||
engine_client = await asyncio.get_running_loop().run_in_executor(
|
engine_client.shutdown()
|
||||||
None, build_engine)
|
|
||||||
|
|
||||||
yield engine_client
|
# MQLLMEngine.
|
||||||
if hasattr(engine_client, "shutdown"):
|
|
||||||
engine_client.shutdown()
|
|
||||||
return
|
|
||||||
|
|
||||||
# Otherwise, use the multiprocessing AsyncLLMEngine.
|
|
||||||
else:
|
else:
|
||||||
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
|
if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
|
||||||
# Make TemporaryDirectory for prometheus multiprocessing
|
# Make TemporaryDirectory for prometheus multiprocessing
|
||||||
@ -737,15 +726,6 @@ async def run_server(args, **uvicorn_kwargs) -> None:
|
|||||||
|
|
||||||
signal.signal(signal.SIGTERM, signal_handler)
|
signal.signal(signal.SIGTERM, signal_handler)
|
||||||
|
|
||||||
# The child processes will send SIGQUIT to this process when
|
|
||||||
# any error happens. This process then clean up the whole tree.
|
|
||||||
# TODO(rob): move this into AsyncLLM.__init__ once we remove
|
|
||||||
# the context manager below.
|
|
||||||
def sigquit_handler(signum, frame):
|
|
||||||
kill_process_tree(os.getpid())
|
|
||||||
|
|
||||||
signal.signal(signal.SIGQUIT, sigquit_handler)
|
|
||||||
|
|
||||||
async with build_async_engine_client(args) as engine_client:
|
async with build_async_engine_client(args) as engine_client:
|
||||||
app = build_app(args)
|
app = build_app(args)
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
|
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
|
||||||
|
|
||||||
from vllm.config import ModelConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
@ -16,6 +18,7 @@ from vllm.sampling_params import SamplingParams
|
|||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
|
from vllm.utils import kill_process_tree
|
||||||
from vllm.v1.engine.core_client import EngineCoreClient
|
from vllm.v1.engine.core_client import EngineCoreClient
|
||||||
from vllm.v1.engine.detokenizer import Detokenizer
|
from vllm.v1.engine.detokenizer import Detokenizer
|
||||||
from vllm.v1.engine.processor import Processor
|
from vllm.v1.engine.processor import Processor
|
||||||
@ -38,6 +41,22 @@ class AsyncLLM(EngineClient):
|
|||||||
log_requests: bool = True,
|
log_requests: bool = True,
|
||||||
start_engine_loop: bool = True,
|
start_engine_loop: bool = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
|
# The child processes will send SIGQUIT when unrecoverable
|
||||||
|
# errors happen. We kill the process tree here so that the
|
||||||
|
# stack trace is very evident.
|
||||||
|
# TODO: rather than killing the main process, we should
|
||||||
|
# figure out how to raise an AsyncEngineDeadError and
|
||||||
|
# handle at the API server level so we can return a better
|
||||||
|
# error code to the clients calling VLLM.
|
||||||
|
def sigquit_handler(signum, frame):
|
||||||
|
logger.fatal(
|
||||||
|
"AsyncLLM got SIGQUIT from worker processes, shutting "
|
||||||
|
"down. See stack trace above for root cause issue.")
|
||||||
|
kill_process_tree(os.getpid())
|
||||||
|
|
||||||
|
signal.signal(signal.SIGQUIT, sigquit_handler)
|
||||||
|
|
||||||
assert start_engine_loop
|
assert start_engine_loop
|
||||||
|
|
||||||
self.log_requests = log_requests
|
self.log_requests = log_requests
|
||||||
@ -276,9 +295,9 @@ class AsyncLLM(EngineClient):
|
|||||||
# 4) Abort any requests that finished due to stop strings.
|
# 4) Abort any requests that finished due to stop strings.
|
||||||
await self.engine_core.abort_requests_async(reqs_to_abort)
|
await self.engine_core.abort_requests_async(reqs_to_abort)
|
||||||
|
|
||||||
except BaseException as e:
|
except Exception as e:
|
||||||
logger.error(e)
|
logger.exception("EngineCore output handler hit an error: %s", e)
|
||||||
raise e
|
kill_process_tree(os.getpid())
|
||||||
|
|
||||||
async def abort(self, request_id: str) -> None:
|
async def abort(self, request_id: str) -> None:
|
||||||
"""Abort RequestId in self, detokenizer, and engine core."""
|
"""Abort RequestId in self, detokenizer, and engine core."""
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import zmq.asyncio
|
|||||||
|
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils import get_open_zmq_ipc_path
|
from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
|
||||||
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
|
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
|
||||||
EngineCoreProfile, EngineCoreRequest,
|
EngineCoreProfile, EngineCoreRequest,
|
||||||
EngineCoreRequestType, EngineCoreRequestUnion)
|
EngineCoreRequestType, EngineCoreRequestUnion)
|
||||||
@ -144,17 +144,13 @@ class MPClient(EngineCoreClient):
|
|||||||
else:
|
else:
|
||||||
self.ctx = zmq.Context() # type: ignore[attr-defined]
|
self.ctx = zmq.Context() # type: ignore[attr-defined]
|
||||||
|
|
||||||
# Path for IPC.
|
# Paths and sockets for IPC.
|
||||||
output_path = get_open_zmq_ipc_path()
|
output_path = get_open_zmq_ipc_path()
|
||||||
input_path = get_open_zmq_ipc_path()
|
input_path = get_open_zmq_ipc_path()
|
||||||
|
self.output_socket = make_zmq_socket(self.ctx, output_path,
|
||||||
# Get output (EngineCoreOutput) from EngineCore.
|
zmq.constants.PULL)
|
||||||
self.output_socket = self.ctx.socket(zmq.constants.PULL)
|
self.input_socket = make_zmq_socket(self.ctx, input_path,
|
||||||
self.output_socket.connect(output_path)
|
zmq.constants.PUSH)
|
||||||
|
|
||||||
# Send input (EngineCoreRequest) to EngineCore.
|
|
||||||
self.input_socket = self.ctx.socket(zmq.constants.PUSH)
|
|
||||||
self.input_socket.bind(input_path)
|
|
||||||
|
|
||||||
# Start EngineCore in background process.
|
# Start EngineCore in background process.
|
||||||
self.proc_handle: Optional[BackgroundProcHandle]
|
self.proc_handle: Optional[BackgroundProcHandle]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user