Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-12 22:36:03 +08:00)
[Frontend] Improve Startup Failure UX (#7716)
commit 970dfdc01d
parent 91f4522cbf
@@ -1,3 +1,5 @@
+import time
+
 import pytest
 
 from vllm.entrypoints.openai.api_server import build_async_engine_client
@@ -8,19 +10,20 @@ from vllm.utils import FlexibleArgumentParser
 @pytest.mark.asyncio
 async def test_mp_crash_detection():
 
-    with pytest.raises(RuntimeError) as excinfo:
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        parser = make_arg_parser(parser)
-        args = parser.parse_args([])
-        # use an invalid tensor_parallel_size to trigger the
-        # error in the server
-        args.tensor_parallel_size = 65536
+    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
+    parser = make_arg_parser(parser)
+    args = parser.parse_args([])
+    # use an invalid tensor_parallel_size to trigger the
+    # error in the server
+    args.tensor_parallel_size = 65536
 
-        async with build_async_engine_client(args):
-            pass
-    assert "The server process died before responding to the readiness probe"\
-        in str(excinfo.value)
+    start = time.perf_counter()
+    async with build_async_engine_client(args):
+        pass
+    end = time.perf_counter()
+
+    assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
+                              "if there is an error in the startup.")
 
 
 @pytest.mark.asyncio
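The rewritten test above drops the pytest.raises(RuntimeError) expectation: a startup failure should no longer raise, so the test instead times the failed startup with time.perf_counter() and asserts a graceful shutdown in under 60 seconds. A minimal runnable sketch of that timing pattern, where slow_failing_startup is a made-up stand-in for build_async_engine_client:

import asyncio
import time
from contextlib import asynccontextmanager


@asynccontextmanager
async def slow_failing_startup():
    # Simulate a startup that fails after a short delay instead of hanging.
    await asyncio.sleep(0.1)
    yield None  # signal failure by yielding None, as the new code does


async def main():
    start = time.perf_counter()
    async with slow_failing_startup() as client:
        assert client is None
    end = time.perf_counter()
    assert end - start < 60, "startup failure should surface well within 60s"


asyncio.run(main())

The remaining hunks are in vllm/entrypoints/openai/api_server.py, as the module path in its logger initialization below indicates.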
@@ -8,7 +8,7 @@ import tempfile
 from argparse import Namespace
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set
 
 from fastapi import APIRouter, FastAPI, Request
 from fastapi.exceptions import RequestValidationError
@@ -60,6 +60,7 @@ openai_serving_embedding: OpenAIServingEmbedding
 openai_serving_tokenization: OpenAIServingTokenization
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
+# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
 logger = init_logger('vllm.entrypoints.openai.api_server')
 
 _running_tasks: Set[asyncio.Task] = set()
@@ -94,7 +95,15 @@ async def lifespan(app: FastAPI):
 
 @asynccontextmanager
 async def build_async_engine_client(
-        args: Namespace) -> AsyncIterator[AsyncEngineClient]:
+        args: Namespace) -> AsyncIterator[Optional[AsyncEngineClient]]:
+    """
+    Create AsyncEngineClient, either:
+        - in-process using the AsyncLLMEngine Directly
+        - multiprocess using AsyncLLMEngine RPC
+
+    Returns the Client or None if the creation failed.
+    """
+
    # Context manager to handle async_engine_client lifecycle
    # Ensures everything is shutdown and cleaned up on error/exit
    global engine_args
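A note on the signature change: a function decorated with @asynccontextmanager is annotated with the iterator type it yields, so AsyncIterator[Optional[AsyncEngineClient]] tells callers that the value bound by `as` may now be None. A tiny sketch with a made-up maybe_client:

from contextlib import asynccontextmanager
from typing import AsyncIterator, Optional


@asynccontextmanager
async def maybe_client(ok: bool) -> AsyncIterator[Optional[str]]:
    # The annotation names the iterator type, not a context-manager type;
    # Optional[...] warns callers that `as` may bind None.
    yield "client" if ok else None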
@@ -157,11 +166,13 @@ async def build_async_engine_client(
             try:
                 await rpc_client.setup()
                 break
-            except TimeoutError as e:
+            except TimeoutError:
                 if not rpc_server_process.is_alive():
-                    raise RuntimeError(
-                        "The server process died before "
-                        "responding to the readiness probe") from e
+                    logger.error(
+                        "RPCServer process died before responding "
+                        "to readiness probe")
+                    yield None
+                    return
 
         yield async_engine_client
     finally:
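Swapping raise RuntimeError(...) for logger.error(...) plus yield None keeps control inside the context manager, so the enclosing try/finally can still tear down the RPC server process on the failure path. A runnable sketch of why the cleanup still runs, with illustrative names rather than vLLM's:

import asyncio
from contextlib import asynccontextmanager


@asynccontextmanager
async def client_ctx(healthy: bool):
    try:
        if not healthy:
            yield None  # the generator must still yield exactly once
            return      # then skip the success path
        yield "client"
    finally:
        # Runs on both paths, e.g. terminating a spawned server process.
        print("cleanup ran")


async def main():
    async with client_ctx(healthy=False) as client:
        assert client is None


asyncio.run(main())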
@@ -410,6 +421,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     logger.info("args: %s", args)
 
     async with build_async_engine_client(args) as async_engine_client:
+        # If None, creation of the client failed and we exit.
+        if async_engine_client is None:
+            return
+
         app = await init_app(async_engine_client, args)
 
         shutdown_task = await serve_http(
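On the caller side, run_server now treats a None client as a failed startup and exits without a traceback, which is the UX improvement the commit title refers to. An end-to-end sketch of the contract, with build_client as a placeholder for build_async_engine_client:

import asyncio
from contextlib import asynccontextmanager


@asynccontextmanager
async def build_client(fail: bool):
    # Placeholder: yield None on startup failure, a client object otherwise.
    yield None if fail else "engine-client"


async def run_server(fail: bool) -> None:
    async with build_client(fail) as client:
        if client is None:  # creation failed; exit quietly
            return
        print("serving with", client)


asyncio.run(run_server(fail=True))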