Decrement the server load counter when a client disconnects, reading
app.state defensively. The post-image blob id on the index line below
predates this revision of the added lines.

diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index cc651a172b40..1b0ea69096cc 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -26,4 +26,13 @@ async def listen_for_disconnect(request: Request) -> None:
     while True:
         message = await request.receive()
         if message["type"] == "http.disconnect":
+            # On timeout/cancellation the BackgroundTask in load_aware_call
+            # cannot decrement the server load metrics; they must be
+            # decremented by with_cancellation instead.
+            # Read the state defensively: an app that never configured load
+            # tracking must not raise AttributeError on disconnect.
+            state = request.app.state
+            tracking = getattr(state, "enable_server_load_tracking", False)
+            if tracking and hasattr(state, "server_load_metrics"):
+                state.server_load_metrics -= 1
             break