From 86fc2321ff1e1bd2ba15193053bf4515a19bb481 Mon Sep 17 00:00:00 2001 From: Kay Yan Date: Mon, 7 Apr 2025 11:34:51 +0800 Subject: [PATCH] [Metrics] Add bucket for `request_latency`, `time_to_first_token` and `time_per_output_token` (#15202) Signed-off-by: Kay Yan --- vllm/engine/metrics.py | 7 ++++--- vllm/v1/metrics/loggers.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 5890f654e3820..7c4265fac20b0 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -156,7 +156,8 @@ class Metrics: labelnames=labelnames, buckets=[ 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0 ]) self.histogram_time_per_output_token = self._histogram_cls( name="vllm:time_per_output_token_seconds", @@ -164,14 +165,14 @@ class Metrics: labelnames=labelnames, buckets=[ 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, - 1.0, 2.5 + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 ]) # Request stats # Latency request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 + 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] self.histogram_e2e_time_request = self._histogram_cls( name="vllm:e2e_request_latency_seconds", diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 73883d9a735dd..3959be40b7253 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -239,7 +239,8 @@ class PrometheusStatLogger(StatLoggerBase): documentation="Histogram of time to first token in seconds.", buckets=[ 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, + 640.0, 2560.0 ], labelnames=labelnames).labels(*labelvalues) @@ -249,13 +250,13 @@ class PrometheusStatLogger(StatLoggerBase): documentation="Histogram of time per output token in seconds.", buckets=[ 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, - 0.75, 1.0, 2.5 + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 ], labelnames=labelnames).labels(*labelvalues) request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 + 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] self.histogram_e2e_time_request = \ prometheus_client.Histogram(