[Metrics] Deprecate TPOT in favor of ITL (#24110)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
Mark McLoughlin 2025-09-02 19:10:10 +01:00 committed by GitHub
parent 9480ae24e3
commit 2417798471
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 64 additions and 26 deletions

View File

@ -402,7 +402,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
@ -418,7 +418,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
@ -435,7 +435,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
@ -452,7 +452,7 @@
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
@ -468,7 +468,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Mean",
@ -476,7 +476,7 @@
"refId": "E"
}
],
"title": "Time Per Output Token Latency",
"title": "Inter Token Latency",
"type": "timeseries"
},
{

View File

@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:inter_token_latency_seconds_sum",
"vllm:inter_token_latency_seconds_bucket",
"vllm:inter_token_latency_seconds_count",
"vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count",
@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count",
]
HIDDEN_DEPRECATED_METRICS: list[str] = []
HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
]
@pytest.mark.asyncio
@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert response.status_code == HTTPStatus.OK
for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
if (not server.show_hidden_metrics
and metric not in HIDDEN_DEPRECATED_METRICS):
assert metric in response.text
if (metric in HIDDEN_DEPRECATED_METRICS
and not server.show_hidden_metrics):
continue
assert metric in response.text
@pytest.mark.asyncio

View File

@ -1414,7 +1414,7 @@ class LLMEngine:
num_generation_tokens_iter = 0
num_tokens_iter = 0
time_to_first_tokens_iter: List[float] = []
time_per_output_tokens_iter: List[float] = []
inter_token_latencies_iter: List[float] = []
num_preemption_iter = (0 if scheduler_outputs is None else
scheduler_outputs.preempted)
@ -1498,9 +1498,9 @@ class LLMEngine:
num_generation_tokens_from_prefill_groups += (
seq_group.num_seqs())
else:
# TPOTs.
# ITLs
latency = seq_group.get_last_token_latency()
time_per_output_tokens_iter.append(latency)
inter_token_latencies_iter.append(latency)
if seq_group.state.current_step == 0:
# For async_output_proc, the do_log_stats()
# is called following init_multi_step(), which
@ -1582,7 +1582,7 @@ class LLMEngine:
num_generation_tokens_iter=num_generation_tokens_iter,
num_tokens_iter=num_tokens_iter,
time_to_first_tokens_iter=time_to_first_tokens_iter,
time_per_output_tokens_iter=time_per_output_tokens_iter,
inter_token_latencies_iter=inter_token_latencies_iter,
num_preemption_iter=num_preemption_iter,
# Request stats

View File

@ -113,9 +113,21 @@ class Metrics:
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
2560.0
])
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
self.histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds",
documentation="Histogram of time per output token in seconds.",
documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
labelnames=labelnames,
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
])
self.histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds",
documentation="Histogram of inter token latency in seconds.",
labelnames=labelnames,
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase):
self._log_histogram(self.metrics.histogram_time_to_first_token,
stats.time_to_first_tokens_iter)
self._log_histogram(self.metrics.histogram_time_per_output_token,
stats.time_per_output_tokens_iter)
stats.inter_token_latencies_iter)
self._log_histogram(self.metrics.histogram_inter_token_latency,
stats.inter_token_latencies_iter)
# Request level data
# Latency

View File

@ -43,7 +43,7 @@ class Stats:
num_generation_tokens_iter: int
num_tokens_iter: int
time_to_first_tokens_iter: List[float]
time_per_output_tokens_iter: List[float]
inter_token_latencies_iter: List[float]
num_preemption_iter: int
# Request stats (should have _requests suffix)

View File

@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase):
self.histogram_time_to_first_token = make_per_engine(
histogram_time_to_first_token, engine_indexes, model_name)
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds",
documentation="Histogram of time per output token in seconds.",
documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase):
self.histogram_time_per_output_token = make_per_engine(
histogram_time_per_output_token, engine_indexes, model_name)
histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds",
documentation="Histogram of inter-token latency in seconds.",
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
],
labelnames=labelnames)
self.histogram_inter_token_latency = make_per_engine(
histogram_inter_token_latency, engine_indexes, model_name)
request_latency_buckets = [
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase):
self.histogram_n_request[engine_idx].observe(n_param)
for ttft in iteration_stats.time_to_first_tokens_iter:
self.histogram_time_to_first_token[engine_idx].observe(ttft)
for tpot in iteration_stats.time_per_output_tokens_iter:
self.histogram_time_per_output_token[engine_idx].observe(tpot)
for itl in iteration_stats.inter_token_latencies_iter:
self.histogram_inter_token_latency[engine_idx].observe(itl)
self.histogram_time_per_output_token[engine_idx].observe(itl)
for finished_request in iteration_stats.finished_requests:
self.counter_request_success[

View File

@ -96,7 +96,7 @@ class IterationStats:
self.max_num_generation_tokens_iter: list[int] = []
self.n_params_iter: list[int] = []
self.time_to_first_tokens_iter: list[float] = []
self.time_per_output_tokens_iter: list[float] = []
self.inter_token_latencies_iter: list[float] = []
self.waiting_lora_adapters: dict[str, int] = {}
self.running_lora_adapters: dict[str, int] = {}
@ -128,8 +128,8 @@ class IterationStats:
if is_prefilling:
req_stats.first_token_ts = engine_core_timestamp
else:
tpot = engine_core_timestamp - req_stats.last_token_ts
self.time_per_output_tokens_iter.append(tpot)
itl = engine_core_timestamp - req_stats.last_token_ts
self.inter_token_latencies_iter.append(itl)
req_stats.last_token_ts = engine_core_timestamp