From 2417798471af8521e488c04f3a43c91e6836a705 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin
Date: Tue, 2 Sep 2025 19:10:10 +0100
Subject: [PATCH] [Metrics] Deprecate TPOT in favor of ITL (#24110)

Signed-off-by: Mark McLoughlin
---
 .../prometheus_grafana/grafana.json      | 12 +++++-----
 tests/entrypoints/openai/test_metrics.py | 22 +++++++++++++------
 vllm/engine/llm_engine.py                |  8 +++----
 vllm/engine/metrics.py                   | 18 +++++++++++++--
 vllm/engine/metrics_types.py             |  2 +-
 vllm/v1/metrics/loggers.py               | 22 ++++++++++++++++++++---
 vllm/v1/metrics/stats.py                 |  6 ++---
 7 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json
index 3488956a5b24c..37abc9de926fd 100644
--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -402,7 +402,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "includeNullMetadata": false,
           "instant": false,
@@ -418,7 +418,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "hide": false,
           "includeNullMetadata": false,
@@ -435,7 +435,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "hide": false,
           "includeNullMetadata": false,
@@ -452,7 +452,7 @@
           },
           "disableTextWrap": false,
           "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
           "fullMetaSearch": false,
           "hide": false,
           "includeNullMetadata": false,
@@ -468,7 +468,7 @@
             "uid": "${DS_PROMETHEUS}"
           },
           "editorMode": "code",
-          "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
           "hide": false,
           "instant": false,
           "legendFormat": "Mean",
           "range": true,
           "refId": "E"
         }
       ],
-      "title": "Time Per Output Token Latency",
+      "title": "Inter Token Latency",
       "type": "timeseries"
     },
     {
diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index ff2e7004ff9f8..a4e1aca8bcac2 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_params_max_tokens_sum",
     "vllm:request_params_max_tokens_bucket",
     "vllm:request_params_max_tokens_count",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:inter_token_latency_seconds_sum",
+    "vllm:inter_token_latency_seconds_bucket",
+    "vllm:inter_token_latency_seconds_count",
     "vllm:e2e_request_latency_seconds_sum",
     "vllm:e2e_request_latency_seconds_bucket",
     "vllm:e2e_request_latency_seconds_count",
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
     "vllm:request_decode_time_seconds_count",
 ]

-HIDDEN_DEPRECATED_METRICS: list[str] = []
+HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+]


 @pytest.mark.asyncio
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
     assert response.status_code == HTTPStatus.OK

     for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (not server.show_hidden_metrics
-                and metric not in HIDDEN_DEPRECATED_METRICS):
-            assert metric in response.text
+        if (metric in HIDDEN_DEPRECATED_METRICS
+                and not server.show_hidden_metrics):
+            continue
+        assert metric in response.text


 @pytest.mark.asyncio
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 10ded6f16d41c..47f56e58130fa 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1414,7 +1414,7 @@ class LLMEngine:
         num_generation_tokens_iter = 0
         num_tokens_iter = 0
         time_to_first_tokens_iter: List[float] = []
-        time_per_output_tokens_iter: List[float] = []
+        inter_token_latencies_iter: List[float] = []
         num_preemption_iter = (0 if scheduler_outputs is None else
                                scheduler_outputs.preempted)

@@ -1498,9 +1498,9 @@ class LLMEngine:
                     num_generation_tokens_from_prefill_groups += (
                         seq_group.num_seqs())
             else:
-                # TPOTs.
+                # ITLs.
                 latency = seq_group.get_last_token_latency()
-                time_per_output_tokens_iter.append(latency)
+                inter_token_latencies_iter.append(latency)
                 if seq_group.state.current_step == 0:
                     # For async_output_proc, the do_log_stats()
                     # is called following init_multi_step(), which
@@ -1582,7 +1582,7 @@ class LLMEngine:
             num_generation_tokens_iter=num_generation_tokens_iter,
             num_tokens_iter=num_tokens_iter,
             time_to_first_tokens_iter=time_to_first_tokens_iter,
-            time_per_output_tokens_iter=time_per_output_tokens_iter,
+            inter_token_latencies_iter=inter_token_latencies_iter,
             num_preemption_iter=num_preemption_iter,

             # Request stats
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index ba8dbd1fad791..0a8709db40880 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -113,9 +113,21 @@ class Metrics:
                 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0,
                 640.0, 2560.0
             ])
+        # Deprecated in 0.11 - Renamed to vllm:inter_token_latency_seconds
+        # TODO: in 0.12, only enable if show_hidden_metrics=True
         self.histogram_time_per_output_token = self._histogram_cls(
             name="vllm:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
+            documentation=(
+                "Histogram of time per output token in seconds. "
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ])
+        self.histogram_inter_token_latency = self._histogram_cls(
+            name="vllm:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
             labelnames=labelnames,
             buckets=[
                 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
@@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase):
         self._log_histogram(self.metrics.histogram_time_to_first_token,
                             stats.time_to_first_tokens_iter)
         self._log_histogram(self.metrics.histogram_time_per_output_token,
-                            stats.time_per_output_tokens_iter)
+                            stats.inter_token_latencies_iter)
+        self._log_histogram(self.metrics.histogram_inter_token_latency,
+                            stats.inter_token_latencies_iter)

         # Request level data
         # Latency
diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py
index 3281a9121a9df..9778ab5a8c99b 100644
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@@ -43,7 +43,7 @@ class Stats:
     num_generation_tokens_iter: int
     num_tokens_iter: int
     time_to_first_tokens_iter: List[float]
-    time_per_output_tokens_iter: List[float]
+    inter_token_latencies_iter: List[float]
     num_preemption_iter: int

     # Request stats (should have _requests suffix)
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 3b0616952babf..41e07a00564aa 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase):
         self.histogram_time_to_first_token = make_per_engine(
             histogram_time_to_first_token, engine_indexes, model_name)

+        # Deprecated in 0.11 - Renamed to vllm:inter_token_latency_seconds
+        # TODO: in 0.12, only enable if show_hidden_metrics=True
         histogram_time_per_output_token = self._histogram_cls(
             name="vllm:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
+            documentation=(
+                "Histogram of time per output token in seconds. "
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
             buckets=[
                 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
@@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase):
         self.histogram_time_per_output_token = make_per_engine(
             histogram_time_per_output_token, engine_indexes, model_name)

+        histogram_inter_token_latency = self._histogram_cls(
+            name="vllm:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_inter_token_latency = make_per_engine(
+            histogram_inter_token_latency, engine_indexes, model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
         ]
@@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase):
             self.histogram_n_request[engine_idx].observe(n_param)
         for ttft in iteration_stats.time_to_first_tokens_iter:
             self.histogram_time_to_first_token[engine_idx].observe(ttft)
-        for tpot in iteration_stats.time_per_output_tokens_iter:
-            self.histogram_time_per_output_token[engine_idx].observe(tpot)
+        for itl in iteration_stats.inter_token_latencies_iter:
+            self.histogram_inter_token_latency[engine_idx].observe(itl)
+            self.histogram_time_per_output_token[engine_idx].observe(itl)

         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 95094bda65cde..45c32aaaaf6c4 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -96,7 +96,7 @@ class IterationStats:
         self.max_num_generation_tokens_iter: list[int] = []
         self.n_params_iter: list[int] = []
         self.time_to_first_tokens_iter: list[float] = []
-        self.time_per_output_tokens_iter: list[float] = []
+        self.inter_token_latencies_iter: list[float] = []
         self.waiting_lora_adapters: dict[str, int] = {}
         self.running_lora_adapters: dict[str, int] = {}

@@ -128,8 +128,8 @@ class IterationStats:
         if is_prefilling:
             req_stats.first_token_ts = engine_core_timestamp
         else:
-            tpot = engine_core_timestamp - req_stats.last_token_ts
-            self.time_per_output_tokens_iter.append(tpot)
+            itl = engine_core_timestamp - req_stats.last_token_ts
+            self.inter_token_latencies_iter.append(itl)

         req_stats.last_token_ts = engine_core_timestamp
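Migration note: during the deprecation window the server exports both histograms with identical observations, so dashboards and alerts can switch metric names one-for-one. The sketch below is illustrative only, not part of the patch: it assumes a server exposing Prometheus metrics at http://localhost:8000/metrics (vLLM's default port) and the third-party `requests` package, and it checks that the two series stay in lockstep.

# Illustrative sketch, not part of this patch: confirm the deprecated
# vllm:time_per_output_token_seconds histogram mirrors the new
# vllm:inter_token_latency_seconds histogram on a running server.
# Assumptions: metrics exposed at http://localhost:8000/metrics and
# the `requests` package installed.
import requests

text = requests.get("http://localhost:8000/metrics", timeout=5).text


def histogram_count(metric: str) -> float:
    """Sum a histogram's _count samples across all label sets."""
    total = 0.0
    for line in text.splitlines():
        if line.startswith(metric + "_count"):
            total += float(line.rsplit(" ", 1)[1])
    return total


itl = histogram_count("vllm:inter_token_latency_seconds")
tpot = histogram_count("vllm:time_per_output_token_seconds")

# Both histograms observe the same per-token latencies in this patch, so
# their sample counts should match while the deprecated series is still
# exported (per the TODO above, 0.12 is expected to hide it unless
# show_hidden_metrics is enabled).
assert itl == tpot, f"ITL count {itl} != TPOT count {tpot}"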