From 965c5f4914595a96d3c2d6ee2d0ed9dd00a8d573 Mon Sep 17 00:00:00 2001
From: Tomas Ruiz
Date: Fri, 17 Oct 2025 04:55:09 +0200
Subject: [PATCH] vllm bench serve shows num of failed requests (#26478)

Signed-off-by: Tomas Ruiz
---
 vllm/benchmarks/serve.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index c5a52133a471d..89958848c06a8 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -64,6 +64,7 @@ class TaskType(Enum):
 @dataclass
 class BenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     total_output: int
     request_throughput: float
@@ -97,6 +98,7 @@ class BenchmarkMetrics:
 @dataclass
 class EmbedBenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     request_throughput: float
     total_token_throughput: float
@@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
     """
     total_input = 0
     completed = 0
+    failed = 0
     e2els: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             e2els.append(outputs[i].latency)
             completed += 1
             total_input += outputs[i].prompt_len
+        else:
+            failed += 1
 
     if completed == 0:
         warnings.warn(
@@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
     )
     metrics = EmbedBenchmarkMetrics(
         completed=completed,
+        failed=failed,
         total_input=total_input,
         request_throughput=completed / dur_s,
         total_token_throughput=total_input / dur_s,
@@ -366,6 +372,7 @@ def calculate_metrics(
 
     # Find the time range across all successful requests
     successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
     if successful_outputs:
         min_start_time = min(output.start_time for output in successful_outputs)
         max_end_time = max(
@@ -427,6 +434,7 @@ def calculate_metrics(
 
     metrics = BenchmarkMetrics(
         completed=completed,
+        failed=len(failed_outputs),
         total_input=total_input,
         total_output=sum(actual_output_lens),
         request_throughput=completed / dur_s,
@@ -734,6 +742,7 @@ async def benchmark(
 
     print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
     if max_concurrency is not None:
         print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
     if request_rate != float("inf"):
@@ -779,6 +788,7 @@ async def benchmark(
     result = {
         "duration": benchmark_duration,
         "completed": metrics.completed,
+        "failed": metrics.failed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "request_throughput": metrics.request_throughput,