mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-22 20:45:01 +08:00
[Frontend] Publish Prometheus metrics in run_batch API (#7641)
This commit is contained in:
parent
6885fde317
commit
8da48e4d95
@ -1,3 +1,7 @@
|
|||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
|
|
||||||
import openai
|
import openai
|
||||||
@ -177,3 +181,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI):
|
|||||||
|
|
||||||
for metric in EXPECTED_METRICS:
|
for metric in EXPECTED_METRICS:
|
||||||
assert metric in response.text
|
assert metric in response.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_metrics_exist_run_batch():
    """End-to-end check that run_batch serves Prometheus metrics.

    Spawns ``vllm.entrypoints.openai.run_batch`` as a subprocess with
    ``--enable-metrics``, waits for the metrics HTTP server to come up,
    and asserts that ``GET /metrics`` returns 200 OK.
    """
    # One embedding request; the model here must match the --model flag
    # passed to the subprocess below.
    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501

    base_url = "0.0.0.0"
    port = "8001"
    server_url = f"http://{base_url}:{port}"

    with tempfile.NamedTemporaryFile(
            "w") as input_file, tempfile.NamedTemporaryFile(
                "r") as output_file:
        input_file.write(input_batch)
        # Flush so the subprocess sees the request line immediately.
        input_file.flush()

        proc = subprocess.Popen([
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "intfloat/e5-mistral-7b-instruct",
            "--enable-metrics",
            "--url",
            base_url,
            "--port",
            port,
        ])

        def is_server_up(url):
            # Best-effort probe: the metrics server may not be listening
            # yet, in which case the connection is simply refused.
            try:
                response = requests.get(url)
                return response.status_code == 200
            except requests.ConnectionError:
                return False

        try:
            # Poll until the metrics server answers.  Unlike a bare
            # sleep-loop, bail out early if the subprocess already died
            # and bound the total wait so a wedged server cannot hang
            # the test suite forever.
            deadline = time.monotonic() + 600
            while not is_server_up(server_url):
                assert proc.poll() is None, (
                    "run_batch subprocess exited before the metrics "
                    "server came up")
                assert time.monotonic() < deadline, (
                    "timed out waiting for the metrics server")
                time.sleep(1)

            response = requests.get(server_url + "/metrics")
            assert response.status_code == HTTPStatus.OK

            # Let the batch job run to completion.
            proc.wait()
        finally:
            # Never leak the subprocess past the test, even on failure.
            if proc.poll() is None:
                proc.terminate()
                proc.wait()
|
|||||||
@ -3,6 +3,7 @@ from io import StringIO
|
|||||||
from typing import Awaitable, Callable, List
|
from typing import Awaitable, Callable, List
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
from prometheus_client import start_http_server
|
||||||
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
|
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
|
||||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||||
@ -56,6 +57,24 @@ def parse_args():
|
|||||||
'ID numbers being printed in log.'
|
'ID numbers being printed in log.'
|
||||||
'\n\nDefault: Unlimited')
|
'\n\nDefault: Unlimited')
|
||||||
|
|
||||||
|
parser.add_argument("--enable-metrics",
|
||||||
|
action="store_true",
|
||||||
|
help="Enable Prometheus metrics")
|
||||||
|
parser.add_argument(
|
||||||
|
"--url",
|
||||||
|
type=str,
|
||||||
|
default="0.0.0.0",
|
||||||
|
help="URL to the Prometheus metrics server "
|
||||||
|
"(only needed if enable-metrics is set).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--port",
|
||||||
|
type=int,
|
||||||
|
default=8000,
|
||||||
|
help="Port number for the Prometheus metrics server "
|
||||||
|
"(only needed if enable-metrics is set).",
|
||||||
|
)
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -184,4 +203,12 @@ if __name__ == "__main__":
|
|||||||
logger.info("vLLM batch processing API version %s", VLLM_VERSION)
|
logger.info("vLLM batch processing API version %s", VLLM_VERSION)
|
||||||
logger.info("args: %s", args)
|
logger.info("args: %s", args)
|
||||||
|
|
||||||
|
# Start the Prometheus metrics server. LLMEngine uses the Prometheus client
|
||||||
|
# to publish metrics at the /metrics endpoint.
|
||||||
|
if args.enable_metrics:
|
||||||
|
logger.info("Prometheus metrics enabled")
|
||||||
|
start_http_server(port=args.port, addr=args.url)
|
||||||
|
else:
|
||||||
|
logger.info("Prometheus metrics disabled")
|
||||||
|
|
||||||
asyncio.run(main(args))
|
asyncio.run(main(args))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user