From 67244c86f0f1ffc06fcab9cad5e78989695cc15f Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Thu, 18 Sep 2025 22:29:40 +0800 Subject: [PATCH] feat(api): Return 503 on /health when engine is dead (#24897) Signed-off-by: dongbo910220 <1275604947@qq.com> Co-authored-by: Claude --- vllm/entrypoints/openai/api_server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 93ea846f26f6..912e66412092 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -103,6 +103,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, is_valid_ipv6_address, set_ulimit) +from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -351,8 +352,11 @@ def engine_client(request: Request) -> EngineClient: @router.get("/health", response_class=Response) async def health(raw_request: Request) -> Response: """Health check.""" - await engine_client(raw_request).check_health() - return Response(status_code=200) + try: + await engine_client(raw_request).check_health() + return Response(status_code=200) + except EngineDeadError: + return Response(status_code=503) @router.get("/load")