[V1][Metrics] Add per-request TPOT histogram (#24015)

Signed-off-by: baxingpiaochong <771405853@qq.com>

Commit d06b5a95cb (parent be0bb568c9)
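This commit records a per-request TPOT (time per output token) in a new Prometheus histogram, vllm:request_time_per_output_token_seconds. As the diff below shows, the per-request value is the request's decode time divided by the number of generated tokens minus one, because the first output token is produced during prefill rather than decode. A minimal illustrative sketch of that formula (the variable names and numbers here are made up for the example, not taken from the commit):

    # Illustrative sketch of the per-request TPOT formula recorded by this commit:
    # decode time averaged over every generated token except the first, which is
    # attributed to the prefill phase.
    decode_time = 1.8            # example: seconds spent decoding one request
    num_generation_tokens = 10   # example: tokens generated for that request

    n = num_generation_tokens - 1
    mean_time_per_output_token = decode_time / n if n > 0 else 0
    print(mean_time_per_output_token)  # 0.2 seconds per output token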
@@ -411,6 +411,19 @@ class PrometheusStatLogger(StatLoggerBase):
         self.histogram_inter_token_latency = make_per_engine(
             histogram_inter_token_latency, engine_indexes, model_name)
 
+        histogram_request_time_per_output_token = self._histogram_cls(
+            name="vllm:request_time_per_output_token_seconds",
+            documentation=
+            "Histogram of time_per_output_token_seconds per request.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_request_time_per_output_token = make_per_engine(
+            histogram_request_time_per_output_token, engine_indexes,
+            model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
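For reference, here is a standalone sketch of roughly what the wrapped histogram above amounts to in plain prometheus_client. This is not vLLM code: the label names are assumptions for illustration, and in the actual logger the metric is created through self._histogram_cls and wrapped per engine by make_per_engine with the configured labelnames.

    # Standalone sketch using prometheus_client directly; label names are assumed.
    from prometheus_client import Histogram

    request_tpot_seconds = Histogram(
        "vllm:request_time_per_output_token_seconds",
        "Histogram of time_per_output_token_seconds per request.",
        labelnames=["model_name", "engine"],  # assumed label set, for illustration
        buckets=[
            0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
            1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
        ],
    )

    # One observation per finished request, in seconds per output token.
    request_tpot_seconds.labels(model_name="example-model", engine="0").observe(0.042)

In the logger itself the observation happens once per finished request, as the next hunk shows.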
@@ -583,6 +596,8 @@ class PrometheusStatLogger(StatLoggerBase):
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request[engine_idx].observe(
                 finished_request.num_generation_tokens)
+            self.histogram_request_time_per_output_token[engine_idx].observe(
+                finished_request.mean_time_per_output_token)
             if finished_request.max_tokens_param:
                 self.histogram_max_tokens_request[engine_idx].observe(
                     finished_request.max_tokens_param)
@@ -86,6 +86,7 @@ class FinishedRequestStats:
     prefill_time: float = 0.0
     inference_time: float = 0.0
     decode_time: float = 0.0
+    mean_time_per_output_token: float = 0.0
 
 
 class IterationStats:
@@ -177,6 +178,12 @@ class IterationStats:
         # Any preemptions during prefill or decode are included
         inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
 
+        # Do not count the token generated by the prefill phase
+        mean_time_per_output_token = (decode_time /
+                                      (req_stats.num_generation_tokens - 1)
+                                      if req_stats.num_generation_tokens -
+                                      1 > 0 else 0)
+
         finished_req = \
             FinishedRequestStats(finish_reason=finish_reason,
                                  e2e_latency=e2e_latency,
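Note the guard in the computation above: a request that generates only a single token has no decode interval to average, so its recorded TPOT is 0. A small illustrative check of that behavior (the helper function is made up for the example, not part of the commit):

    # Hypothetical helper mirroring the formula above, for illustration only.
    def mean_tpot(decode_time: float, num_generation_tokens: int) -> float:
        n = num_generation_tokens - 1
        return decode_time / n if n > 0 else 0

    assert mean_tpot(2.0, 5) == 0.5   # 2.0 s of decode over 4 post-prefill tokens
    assert mean_tpot(0.7, 1) == 0     # single-token request: nothing to average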
@@ -186,7 +193,8 @@ class IterationStats:
                                  queued_time=queued_time,
                                  prefill_time=prefill_time,
                                  inference_time=inference_time,
-                                 decode_time=decode_time)
+                                 decode_time=decode_time,
+                                 mean_time_per_output_token=mean_time_per_output_token)
         self.finished_requests.append(finished_req)