mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-04 20:17:03 +08:00
[Metrics] Deprecate TPOT in favor of ITL (#24110)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
This commit is contained in:
parent
9480ae24e3
commit
2417798471
@ -402,7 +402,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
@ -418,7 +418,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -435,7 +435,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -452,7 +452,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -468,7 +468,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "Mean",
|
||||
@ -476,7 +476,7 @@
|
||||
"refId": "E"
|
||||
}
|
||||
],
|
||||
"title": "Time Per Output Token Latency",
|
||||
"title": "Inter Token Latency",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
||||
@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_params_max_tokens_sum",
|
||||
"vllm:request_params_max_tokens_bucket",
|
||||
"vllm:request_params_max_tokens_count",
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
"vllm:time_to_first_token_seconds_sum",
|
||||
"vllm:time_to_first_token_seconds_bucket",
|
||||
"vllm:time_to_first_token_seconds_count",
|
||||
"vllm:inter_token_latency_seconds_sum",
|
||||
"vllm:inter_token_latency_seconds_bucket",
|
||||
"vllm:inter_token_latency_seconds_count",
|
||||
"vllm:e2e_request_latency_seconds_sum",
|
||||
"vllm:e2e_request_latency_seconds_bucket",
|
||||
"vllm:e2e_request_latency_seconds_count",
|
||||
@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
|
||||
"vllm:request_decode_time_seconds_count",
|
||||
]
|
||||
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = []
|
||||
HIDDEN_DEPRECATED_METRICS: list[str] = [
|
||||
"vllm:time_per_output_token_seconds_sum",
|
||||
"vllm:time_per_output_token_seconds_bucket",
|
||||
"vllm:time_per_output_token_seconds_count",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
|
||||
assert response.status_code == HTTPStatus.OK
|
||||
|
||||
for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
|
||||
if (not server.show_hidden_metrics
|
||||
and metric not in HIDDEN_DEPRECATED_METRICS):
|
||||
assert metric in response.text
|
||||
if (metric in HIDDEN_DEPRECATED_METRICS
|
||||
and not server.show_hidden_metrics):
|
||||
continue
|
||||
assert metric in response.text
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -1414,7 +1414,7 @@ class LLMEngine:
|
||||
num_generation_tokens_iter = 0
|
||||
num_tokens_iter = 0
|
||||
time_to_first_tokens_iter: List[float] = []
|
||||
time_per_output_tokens_iter: List[float] = []
|
||||
inter_token_latencies_iter: List[float] = []
|
||||
num_preemption_iter = (0 if scheduler_outputs is None else
|
||||
scheduler_outputs.preempted)
|
||||
|
||||
@ -1498,9 +1498,9 @@ class LLMEngine:
|
||||
num_generation_tokens_from_prefill_groups += (
|
||||
seq_group.num_seqs())
|
||||
else:
|
||||
# TPOTs.
|
||||
# ITLs
|
||||
latency = seq_group.get_last_token_latency()
|
||||
time_per_output_tokens_iter.append(latency)
|
||||
inter_token_latencies_iter.append(latency)
|
||||
if seq_group.state.current_step == 0:
|
||||
# For async_output_proc, the do_log_stats()
|
||||
# is called following init_multi_step(), which
|
||||
@ -1582,7 +1582,7 @@ class LLMEngine:
|
||||
num_generation_tokens_iter=num_generation_tokens_iter,
|
||||
num_tokens_iter=num_tokens_iter,
|
||||
time_to_first_tokens_iter=time_to_first_tokens_iter,
|
||||
time_per_output_tokens_iter=time_per_output_tokens_iter,
|
||||
inter_token_latencies_iter=inter_token_latencies_iter,
|
||||
num_preemption_iter=num_preemption_iter,
|
||||
|
||||
# Request stats
|
||||
|
||||
@ -113,9 +113,21 @@ class Metrics:
|
||||
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
|
||||
2560.0
|
||||
])
|
||||
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
|
||||
# TODO: in 0.12, only enable if show_hidden_metrics=True
|
||||
self.histogram_time_per_output_token = self._histogram_cls(
|
||||
name="vllm:time_per_output_token_seconds",
|
||||
documentation="Histogram of time per output token in seconds.",
|
||||
documentation=(
|
||||
"Histogram of time per output token in seconds."
|
||||
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
|
||||
])
|
||||
self.histogram_inter_token_latency = self._histogram_cls(
|
||||
name="vllm:inter_token_latency_seconds",
|
||||
documentation="Histogram of inter token latency in seconds.",
|
||||
labelnames=labelnames,
|
||||
buckets=[
|
||||
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
self._log_histogram(self.metrics.histogram_time_to_first_token,
|
||||
stats.time_to_first_tokens_iter)
|
||||
self._log_histogram(self.metrics.histogram_time_per_output_token,
|
||||
stats.time_per_output_tokens_iter)
|
||||
stats.inter_token_latencies_iter)
|
||||
self._log_histogram(self.metrics.histogram_inter_token_latency,
|
||||
stats.inter_token_latencies_iter)
|
||||
|
||||
# Request level data
|
||||
# Latency
|
||||
|
||||
@ -43,7 +43,7 @@ class Stats:
|
||||
num_generation_tokens_iter: int
|
||||
num_tokens_iter: int
|
||||
time_to_first_tokens_iter: List[float]
|
||||
time_per_output_tokens_iter: List[float]
|
||||
inter_token_latencies_iter: List[float]
|
||||
num_preemption_iter: int
|
||||
|
||||
# Request stats (should have _requests suffix)
|
||||
|
||||
@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
self.histogram_time_to_first_token = make_per_engine(
|
||||
histogram_time_to_first_token, engine_indexes, model_name)
|
||||
|
||||
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
|
||||
# TODO: in 0.12, only enable if show_hidden_metrics=True
|
||||
histogram_time_per_output_token = self._histogram_cls(
|
||||
name="vllm:time_per_output_token_seconds",
|
||||
documentation="Histogram of time per output token in seconds.",
|
||||
documentation=(
|
||||
"Histogram of time per output token in seconds."
|
||||
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
|
||||
buckets=[
|
||||
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
|
||||
@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
self.histogram_time_per_output_token = make_per_engine(
|
||||
histogram_time_per_output_token, engine_indexes, model_name)
|
||||
|
||||
histogram_inter_token_latency = self._histogram_cls(
|
||||
name="vllm:inter_token_latency_seconds",
|
||||
documentation="Histogram of inter-token latency in seconds.",
|
||||
buckets=[
|
||||
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
|
||||
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
|
||||
],
|
||||
labelnames=labelnames)
|
||||
self.histogram_inter_token_latency = make_per_engine(
|
||||
histogram_inter_token_latency, engine_indexes, model_name)
|
||||
|
||||
request_latency_buckets = [
|
||||
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
|
||||
40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
|
||||
@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase):
|
||||
self.histogram_n_request[engine_idx].observe(n_param)
|
||||
for ttft in iteration_stats.time_to_first_tokens_iter:
|
||||
self.histogram_time_to_first_token[engine_idx].observe(ttft)
|
||||
for tpot in iteration_stats.time_per_output_tokens_iter:
|
||||
self.histogram_time_per_output_token[engine_idx].observe(tpot)
|
||||
for itl in iteration_stats.inter_token_latencies_iter:
|
||||
self.histogram_inter_token_latency[engine_idx].observe(itl)
|
||||
self.histogram_time_per_output_token[engine_idx].observe(itl)
|
||||
|
||||
for finished_request in iteration_stats.finished_requests:
|
||||
self.counter_request_success[
|
||||
|
||||
@ -96,7 +96,7 @@ class IterationStats:
|
||||
self.max_num_generation_tokens_iter: list[int] = []
|
||||
self.n_params_iter: list[int] = []
|
||||
self.time_to_first_tokens_iter: list[float] = []
|
||||
self.time_per_output_tokens_iter: list[float] = []
|
||||
self.inter_token_latencies_iter: list[float] = []
|
||||
self.waiting_lora_adapters: dict[str, int] = {}
|
||||
self.running_lora_adapters: dict[str, int] = {}
|
||||
|
||||
@ -128,8 +128,8 @@ class IterationStats:
|
||||
if is_prefilling:
|
||||
req_stats.first_token_ts = engine_core_timestamp
|
||||
else:
|
||||
tpot = engine_core_timestamp - req_stats.last_token_ts
|
||||
self.time_per_output_tokens_iter.append(tpot)
|
||||
itl = engine_core_timestamp - req_stats.last_token_ts
|
||||
self.inter_token_latencies_iter.append(itl)
|
||||
|
||||
req_stats.last_token_ts = engine_core_timestamp
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user