diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index f0076b2d81db..52264e41e7a1 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -411,6 +411,19 @@ class PrometheusStatLogger(StatLoggerBase):
         self.histogram_inter_token_latency = make_per_engine(
             histogram_inter_token_latency, engine_indexes, model_name)
 
+        histogram_request_time_per_output_token = self._histogram_cls(
+            name="vllm:request_time_per_output_token_seconds",
+            documentation=
+            "Histogram of time_per_output_token_seconds per request.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_request_time_per_output_token = make_per_engine(
+            histogram_request_time_per_output_token, engine_indexes,
+            model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
@@ -583,6 +596,8 @@ class PrometheusStatLogger(StatLoggerBase):
             finished_request.num_prompt_tokens)
         self.histogram_num_generation_tokens_request[engine_idx].observe(
             finished_request.num_generation_tokens)
+        self.histogram_request_time_per_output_token[engine_idx].observe(
+            finished_request.mean_time_per_output_token)
         if finished_request.max_tokens_param:
             self.histogram_max_tokens_request[engine_idx].observe(
                 finished_request.max_tokens_param)
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 0eff557336bc..296c39e8cdb5 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -86,6 +86,7 @@ class FinishedRequestStats:
     prefill_time: float = 0.0
     inference_time: float = 0.0
     decode_time: float = 0.0
+    mean_time_per_output_token: float = 0.0
 
 
 class IterationStats:
@@ -177,6 +178,12 @@ class IterationStats:
         # Any preemptions during prefill or decode are included
         inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
 
+        # Do not count the token generated by the prefill phase
+        mean_time_per_output_token = (decode_time /
+                                      (req_stats.num_generation_tokens - 1)
+                                      if req_stats.num_generation_tokens -
+                                      1 > 0 else 0)
+
         finished_req = \
             FinishedRequestStats(finish_reason=finish_reason,
                                  e2e_latency=e2e_latency,
@@ -186,7 +193,8 @@ class IterationStats:
                                  queued_time=queued_time,
                                  prefill_time=prefill_time,
                                  inference_time=inference_time,
-                                 decode_time=decode_time)
+                                 decode_time=decode_time,
+                                 mean_time_per_output_token=mean_time_per_output_token)
 
         self.finished_requests.append(finished_req)
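
For reference, a minimal sketch of the quantity this change records: the mean time per output token is the request's decode time divided by the number of generated tokens minus one, since the first token is produced during prefill and is excluded. The helper and example values below are illustrative only and not part of vLLM; only the metric name and bucket boundaries are taken from the diff above.

```python
# Illustrative sketch (not vLLM code): how the new per-request metric
# is derived from the timings added in this diff.

def mean_time_per_output_token(decode_time: float,
                               num_generation_tokens: int) -> float:
    """Mean seconds per output token, excluding the token from prefill."""
    # Only num_generation_tokens - 1 tokens are attributed to decode time,
    # because the first generated token comes out of the prefill phase.
    if num_generation_tokens - 1 > 0:
        return decode_time / (num_generation_tokens - 1)
    return 0.0


# Example (hypothetical numbers): a request that spent 1.2 s in decode and
# produced 25 tokens is recorded as 1.2 / 24 = 0.05 s/token, which lands in
# the 0.05 bucket of vllm:request_time_per_output_token_seconds.
print(mean_time_per_output_token(1.2, 25))  # 0.05
```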