[V1][Metrics] Hook up IterationStats for Prometheus metrics (#12478)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
commit 3fd1fb63ef (parent 925d2f1908)
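The hunks below read two fields off an IterationStats object (num_prompt_tokens and num_generation_tokens) but do not show its definition, which lives in vllm.v1.metrics.stats. A minimal sketch consistent with how it is used here, assuming plain integer fields; the real class may carry more state:

# Simplified sketch of the per-engine-step stats object consumed by the
# loggers below. Only the two fields this commit reads are shown; the field
# types and defaults are assumptions, not taken from the diff.
from dataclasses import dataclass


@dataclass
class IterationStats:
    num_prompt_tokens: int = 0       # prefill tokens processed this step
    num_generation_tokens: int = 0   # new tokens generated this step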
@@ -105,8 +105,6 @@ EXPECTED_VALUES = {
 @pytest.mark.asyncio
 async def test_metrics_counts(server: RemoteOpenAIServer,
                               client: openai.AsyncClient, use_v1: bool):
-    if use_v1:
-        pytest.skip("Skipping test on vllm V1")
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
         await client.completions.create(
@@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
 
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
+        if use_v1 and metric_family not in EXPECTED_METRICS_V1:
+            continue
+
         found_metric = False
 
         # Check to see if the metric_family is found in the prom endpoint.
@@ -199,6 +200,8 @@ EXPECTED_METRICS = [
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
+    "vllm:prompt_tokens_total",
+    "vllm:generation_tokens_total",
 ]
 
 
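The test loop above verifies that each expected metric family is exposed on the server's Prometheus endpoint. A standalone sketch of that kind of check using prometheus_client's text parser; the URL, port, and the use of requests are illustrative, not the test's actual helpers:

# Illustrative only: scrape a /metrics endpoint and check that the families
# added for V1 in this commit are present. Endpoint URL is an assumption.
import requests
from prometheus_client.parser import text_string_to_metric_families

response = requests.get("http://localhost:8000/metrics")
sample_names = {
    sample.name
    for family in text_string_to_metric_families(response.text)
    for sample in family.samples
}
assert "vllm:prompt_tokens_total" in sample_names
assert "vllm:generation_tokens_total" in sample_names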
@@ -305,7 +305,8 @@ class AsyncLLM(EngineClient):
             return
 
         for logger in self.stat_loggers:
-            logger.log(scheduler_stats=scheduler_stats)
+            logger.log(scheduler_stats=scheduler_stats,
+                       iteration_stats=iteration_stats)
 
     def encode(
         self,
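With this change, every object registered in stat_loggers must accept both keyword arguments on each engine step. A hypothetical extra logger that satisfies the new interface; the class itself is illustrative and not part of the commit:

# Hypothetical logger matching the new log() signature; only the field names
# it reads (taken from this diff) are real.
from vllm.v1.metrics.stats import IterationStats, SchedulerStats


class QueueDepthPrinter:
    """Toy logger: print queue depth and per-step token counts."""

    def log(self, scheduler_stats: SchedulerStats,
            iteration_stats: IterationStats):
        print(f"running={scheduler_stats.num_running_reqs} "
              f"waiting={scheduler_stats.num_waiting_reqs} "
              f"prompt_tokens={iteration_stats.num_prompt_tokens} "
              f"generation_tokens={iteration_stats.num_generation_tokens}")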
@@ -1,11 +1,12 @@
 import time
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, List
 
+import numpy as np
 import prometheus_client
 
 from vllm.logger import init_logger
-from vllm.v1.metrics.stats import SchedulerStats
+from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
 logger = init_logger(__name__)
 
@@ -15,27 +16,61 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5.0
 class StatLoggerBase(ABC):
 
     @abstractmethod
-    def log(self, scheduler_stats: SchedulerStats):
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         ...
 
 
 class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self):
-        self.last_log_time = time.monotonic()
+        self._reset(time.monotonic())
 
-    def log(self, scheduler_stats: SchedulerStats):
+    def _reset(self, now):
+        self.last_log_time = now
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
+
+    def _local_interval_elapsed(self, now: float) -> bool:
+        # Log every _LOCAL_LOGGING_INTERVAL_SEC.
+        elapsed_time = now - self.last_log_time
+        return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC
+
+    def _track_iteration_stats(self, iteration_stats: IterationStats):
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
+        self.num_generation_tokens.append(
+            iteration_stats.num_generation_tokens)
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        # Compute summary metrics for tracked stats
+        return float(np.sum(tracked_stats) / (now - self.last_log_time))
+
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         """Log Stats to standard output."""
 
-        # Log every _LOCAL_LOGGING_INTERVAL_SEC.
+        self._track_iteration_stats(iteration_stats)
+
         now = time.monotonic()
-        if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC:
+        if not self._local_interval_elapsed(now):
             return
-        self.last_log_time = now
+
+        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
+        generation_throughput = self._get_throughput(
+            self.num_generation_tokens, now)
+
+        self._reset(now)
 
         # Format and print output.
         logger.info(
+            "Avg prompt throughput: %.1f tokens/s, "
+            "Avg generation throughput: %.1f tokens/s, "
             "Running: %d reqs, Waiting: %d reqs ",
+            prompt_throughput,
+            generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
         )
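The throughput figures LoggingStatLogger prints are just the token counts accumulated since the last log divided by the elapsed interval. A standalone rehearsal of that arithmetic with made-up numbers:

# Rehearsal of the interval-throughput arithmetic above; the token counts and
# the 5.0 s interval are example values, not measurements.
import numpy as np

num_prompt_tokens = [120, 80, 100]      # appended once per engine step
num_generation_tokens = [32, 48, 40]
elapsed = 5.0                           # seconds since last_log_time

prompt_throughput = float(np.sum(num_prompt_tokens) / elapsed)          # 60.0
generation_throughput = float(np.sum(num_generation_tokens) / elapsed)  # 24.0

print(f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, "
      f"Avg generation throughput: {generation_throughput:.1f} tokens/s")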
@@ -61,11 +96,26 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
-    def log(self, scheduler_stats: SchedulerStats):
+        self.counter_prompt_tokens = prometheus_client.Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_generation_tokens = prometheus_client.Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames).labels(*labelvalues)
+
+    def log(self, scheduler_stats: SchedulerStats,
+            iteration_stats: IterationStats):
         """Log to prometheus."""
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
 
+        self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
+        self.counter_generation_tokens.inc(
+            iteration_stats.num_generation_tokens)
+
     @staticmethod
     def _unregister_vllm_metrics():
         # Unregister any existing vLLM collectors (for CI/CD
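PrometheusStatLogger now pairs the existing gauges with two labelled counters and bumps them with the per-iteration token counts. For readers unfamiliar with prometheus_client, a minimal standalone sketch of the same gauge/counter pattern; the label name and value are placeholders, not vLLM's actual label set:

# Standalone sketch of the labelled gauge/counter pattern used above.
# "model_name"/"example-model" and the increment values are placeholders.
import prometheus_client

labelnames = ["model_name"]
labelvalues = ["example-model"]

gauge_running = prometheus_client.Gauge(
    name="vllm:num_requests_running",
    documentation="Number of requests currently running.",
    labelnames=labelnames).labels(*labelvalues)

counter_prompt_tokens = prometheus_client.Counter(
    name="vllm:prompt_tokens_total",
    documentation="Number of prefill tokens processed.",
    labelnames=labelnames).labels(*labelvalues)

# Gauges are set to the latest observed value; counters only ever increase.
gauge_running.set(3)
counter_prompt_tokens.inc(128)

# Dump the current exposition text (what a /metrics scrape would return).
print(prometheus_client.generate_latest().decode())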