From 8da48e4d95421cbd96fbdecdffed89a3d1aab218 Mon Sep 17 00:00:00 2001
From: Pooya Davoodi
Date: Fri, 23 Aug 2024 23:04:22 -0700
Subject: [PATCH] [Frontend] Publish Prometheus metrics in run_batch API (#7641)

---
 tests/entrypoints/openai/test_metrics.py | 49 ++++++++++++++++++++++++
 vllm/entrypoints/openai/run_batch.py     | 27 +++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index cbe601e623056..042c3730e09f5 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -1,3 +1,7 @@
+import subprocess
+import sys
+import tempfile
+import time
 from http import HTTPStatus
 
 import openai
@@ -177,3 +181,48 @@ async def test_metrics_exist(client: openai.AsyncOpenAI):
 
     for metric in EXPECTED_METRICS:
         assert metric in response.text
+
+
+def test_metrics_exist_run_batch():
+    input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}"""  # noqa: E501
+
+    base_url = "0.0.0.0"
+    port = "8001"
+    server_url = f"http://{base_url}:{port}"
+
+    with tempfile.NamedTemporaryFile(
+            "w") as input_file, tempfile.NamedTemporaryFile(
+                "r") as output_file:
+        input_file.write(input_batch)
+        input_file.flush()
+        proc = subprocess.Popen([
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.run_batch",
+            "-i",
+            input_file.name,
+            "-o",
+            output_file.name,
+            "--model",
+            "intfloat/e5-mistral-7b-instruct",
+            "--enable-metrics",
+            "--url",
+            base_url,
+            "--port",
+            port,
+        ], )
+
+        def is_server_up(url):
+            try:
+                response = requests.get(url)
+                return response.status_code == 200
+            except requests.ConnectionError:
+                return False
+
+        while not is_server_up(server_url):
+            time.sleep(1)
+
+        response = requests.get(server_url + "/metrics")
+        assert response.status_code == HTTPStatus.OK
+
+        proc.wait()
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 764712fd5648b..32bbade256973 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -3,6 +3,7 @@ from io import StringIO
 from typing import Awaitable, Callable, List
 
 import aiohttp
+from prometheus_client import start_http_server
 
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -56,6 +57,24 @@ def parse_args():
                         'ID numbers being printed in log.'
                         '\n\nDefault: Unlimited')
 
+    parser.add_argument("--enable-metrics",
+                        action="store_true",
+                        help="Enable Prometheus metrics")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="0.0.0.0",
+        help="URL to the Prometheus metrics server "
+        "(only needed if enable-metrics is set).",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Port number for the Prometheus metrics server "
+        "(only needed if enable-metrics is set).",
+    )
+
     return parser.parse_args()
 
 
@@ -184,4 +203,12 @@ if __name__ == "__main__":
     logger.info("vLLM batch processing API version %s", VLLM_VERSION)
     logger.info("args: %s", args)
 
+    # Start the Prometheus metrics server. LLMEngine uses the Prometheus client
+    # to publish metrics at the /metrics endpoint.
+    if args.enable_metrics:
+        logger.info("Prometheus metrics enabled")
+        start_http_server(port=args.port, addr=args.url)
+    else:
+        logger.info("Prometheus metrics disabled")
+
     asyncio.run(main(args))
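
For readers unfamiliar with prometheus_client, the sketch below illustrates the pattern this patch relies on: start_http_server launches a background HTTP server that serves the default metrics registry at /metrics, so anything registered with the client (in vLLM's case, the engine's own metrics) becomes scrapeable while the batch job runs. This is a minimal standalone sketch, not part of the patch; the counter name, port, and work loop are invented for illustration.

    # Standalone sketch (not part of the patch): expose Prometheus metrics
    # from a batch-style script, mirroring how run_batch.py calls
    # start_http_server when --enable-metrics is passed.
    import time

    from prometheus_client import Counter, start_http_server

    # Hypothetical example metric; vLLM's engine registers its own metrics.
    requests_processed = Counter("example_batch_requests_processed",
                                 "Number of batch requests processed")

    if __name__ == "__main__":
        # Same call the patch adds: serve /metrics on the given address/port.
        start_http_server(port=8000, addr="0.0.0.0")
        for _ in range(10):
            requests_processed.inc()  # stand-in for real batch work
            time.sleep(1)
        # While this runs, `curl http://localhost:8000/metrics` shows the counter.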