vllm bench serve shows num of failed requests (#26478)

Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
This commit is contained in:
Tomas Ruiz 2025-10-17 04:55:09 +02:00 committed by GitHub
parent 4d055ef465
commit 965c5f4914
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -64,6 +64,7 @@ class TaskType(Enum):
 @dataclass
 class BenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     total_output: int
     request_throughput: float
@@ -97,6 +98,7 @@ class BenchmarkMetrics:
 @dataclass
 class EmbedBenchmarkMetrics:
     completed: int
+    failed: int
     total_input: int
     request_throughput: float
     total_token_throughput: float
@@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
     """
     total_input = 0
     completed = 0
+    failed = 0
     e2els: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             e2els.append(outputs[i].latency)
             completed += 1
             total_input += outputs[i].prompt_len
+        else:
+            failed += 1
     if completed == 0:
         warnings.warn(
@@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
         )
     metrics = EmbedBenchmarkMetrics(
         completed=completed,
+        failed=failed,
         total_input=total_input,
         request_throughput=completed / dur_s,
         total_token_throughput=total_input / dur_s,
@@ -366,6 +372,7 @@ def calculate_metrics(
     # Find the time range across all successful requests
     successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
     if successful_outputs:
         min_start_time = min(output.start_time for output in successful_outputs)
         max_end_time = max(
@@ -427,6 +434,7 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
+        failed=len(failed_outputs),
         total_input=total_input,
         total_output=sum(actual_output_lens),
         request_throughput=completed / dur_s,
@@ -734,6 +742,7 @@ async def benchmark(
     print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
     if max_concurrency is not None:
         print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
     if request_rate != float("inf"):
@@ -779,6 +788,7 @@ async def benchmark(
     result = {
         "duration": benchmark_duration,
         "completed": metrics.completed,
+        "failed": metrics.failed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "request_throughput": metrics.request_throughput,