From 8da48e4d95421cbd96fbdecdffed89a3d1aab218 Mon Sep 17 00:00:00 2001
From: Pooya Davoodi
Date: Fri, 23 Aug 2024 23:04:22 -0700
Subject: [PATCH] [Frontend] Publish Prometheus metrics in run_batch API (#7641)

---
 tests/entrypoints/openai/test_metrics.py | 49 ++++++++++++++++++++++++
 vllm/entrypoints/openai/run_batch.py     | 27 +++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index cbe601e623056..042c3730e09f5 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -1,3 +1,7 @@
+import subprocess
+import sys
+import tempfile
+import time
 from http import HTTPStatus
 
 import openai
@@ -177,3 +181,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI):
 
     for metric in EXPECTED_METRICS:
         assert metric in response.text
+
+
+def test_metrics_exist_run_batch():
+    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501
+
+    base_url = "0.0.0.0"
+    port = "8001"
+    server_url = f"http://{base_url}:{port}"
+
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(input_batch)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "intfloat/e5-mistral-7b-instruct",
+            "--enable-metrics",
+            "--url",
+            base_url,
+            "--port",
+            port,
+        ], )
+
+        def is_server_up(url):
+            try:
+                response = requests.get(url)
+                return response.status_code == 200
+            except requests.ConnectionError:
+                return False
+
+        while not is_server_up(server_url):
+            time.sleep(1)
+
+        response = requests.get(server_url + "/metrics")
+        assert response.status_code == HTTPStatus.OK
+
+        proc.wait()
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 764712fd5648b..32bbade256973 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -3,6 +3,7 @@ from io import StringIO
 from typing import Awaitable, Callable, List
 
 import aiohttp
+from prometheus_client import start_http_server
 
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -56,6 +57,24 @@ def parse_args():
                         'ID numbers being printed in log.'
                         '\n\nDefault: Unlimited')
 
+    parser.add_argument("--enable-metrics",
+                        action="store_true",
+                        help="Enable Prometheus metrics")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="0.0.0.0",
+        help="URL to the Prometheus metrics server "
+        "(only needed if enable-metrics is set).",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Port number for the Prometheus metrics server "
+        "(only needed if enable-metrics is set).",
+    )
+
     return parser.parse_args()
 
 
@@ -184,4 +203,12 @@ if __name__ == "__main__":
     logger.info("vLLM batch processing API version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
+    # Start the Prometheus metrics server. LLMEngine uses the Prometheus client
+    # to publish metrics at the /metrics endpoint.
+    if args.enable_metrics:
+        logger.info("Prometheus metrics enabled")
+        start_http_server(port=args.port, addr=args.url)
+    else:
+        logger.info("Prometheus metrics disabled")
+
     asyncio.run(main(args))
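
For readers unfamiliar with prometheus_client, the sketch below illustrates the pattern this patch relies on: start_http_server launches a background HTTP server that serves the default metrics registry at /metrics, so anything registered with the client (in vLLM's case, the engine's own metrics) becomes scrapeable while the batch job runs. This is a minimal standalone sketch, not part of the patch; the counter name, port, and work loop are invented for illustration.

    # Standalone sketch (not part of the patch): expose Prometheus metrics
    # from a batch-style script, mirroring how run_batch.py calls
    # start_http_server when --enable-metrics is passed.
    import time

    from prometheus_client import Counter, start_http_server

    # Hypothetical example metric; vLLM's engine registers its own metrics.
    requests_processed = Counter("example_batch_requests_processed",
                                 "Number of batch requests processed")

    if __name__ == "__main__":
        # Same call the patch adds: serve /metrics on the given address/port.
        start_http_server(port=8000, addr="0.0.0.0")
        for _ in range(10):
            requests_processed.inc()  # stand-in for real batch work
            time.sleep(1)
        # While this runs, `curl http://localhost:8000/metrics` shows the counter.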