mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-02 00:57:57 +08:00
vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
This commit is contained in:
parent
4d055ef465
commit
965c5f4914
@ -64,6 +64,7 @@ class TaskType(Enum):
|
|||||||
@dataclass
|
@dataclass
|
||||||
class BenchmarkMetrics:
|
class BenchmarkMetrics:
|
||||||
completed: int
|
completed: int
|
||||||
|
failed: int
|
||||||
total_input: int
|
total_input: int
|
||||||
total_output: int
|
total_output: int
|
||||||
request_throughput: float
|
request_throughput: float
|
||||||
@ -97,6 +98,7 @@ class BenchmarkMetrics:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class EmbedBenchmarkMetrics:
|
class EmbedBenchmarkMetrics:
|
||||||
completed: int
|
completed: int
|
||||||
|
failed: int
|
||||||
total_input: int
|
total_input: int
|
||||||
request_throughput: float
|
request_throughput: float
|
||||||
total_token_throughput: float
|
total_token_throughput: float
|
||||||
@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
|
|||||||
"""
|
"""
|
||||||
total_input = 0
|
total_input = 0
|
||||||
completed = 0
|
completed = 0
|
||||||
|
failed = 0
|
||||||
e2els: list[float] = []
|
e2els: list[float] = []
|
||||||
for i in range(len(outputs)):
|
for i in range(len(outputs)):
|
||||||
if outputs[i].success:
|
if outputs[i].success:
|
||||||
e2els.append(outputs[i].latency)
|
e2els.append(outputs[i].latency)
|
||||||
completed += 1
|
completed += 1
|
||||||
total_input += outputs[i].prompt_len
|
total_input += outputs[i].prompt_len
|
||||||
|
else:
|
||||||
|
failed += 1
|
||||||
|
|
||||||
if completed == 0:
|
if completed == 0:
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
|
|||||||
)
|
)
|
||||||
metrics = EmbedBenchmarkMetrics(
|
metrics = EmbedBenchmarkMetrics(
|
||||||
completed=completed,
|
completed=completed,
|
||||||
|
failed=failed,
|
||||||
total_input=total_input,
|
total_input=total_input,
|
||||||
request_throughput=completed / dur_s,
|
request_throughput=completed / dur_s,
|
||||||
total_token_throughput=total_input / dur_s,
|
total_token_throughput=total_input / dur_s,
|
||||||
@ -366,6 +372,7 @@ def calculate_metrics(
|
|||||||
|
|
||||||
# Find the time range across all successful requests
|
# Find the time range across all successful requests
|
||||||
successful_outputs = [output for output in outputs if output.success]
|
successful_outputs = [output for output in outputs if output.success]
|
||||||
|
failed_outputs = [output for output in outputs if not output.success]
|
||||||
if successful_outputs:
|
if successful_outputs:
|
||||||
min_start_time = min(output.start_time for output in successful_outputs)
|
min_start_time = min(output.start_time for output in successful_outputs)
|
||||||
max_end_time = max(
|
max_end_time = max(
|
||||||
@ -427,6 +434,7 @@ def calculate_metrics(
|
|||||||
|
|
||||||
metrics = BenchmarkMetrics(
|
metrics = BenchmarkMetrics(
|
||||||
completed=completed,
|
completed=completed,
|
||||||
|
failed=len(failed_outputs),
|
||||||
total_input=total_input,
|
total_input=total_input,
|
||||||
total_output=sum(actual_output_lens),
|
total_output=sum(actual_output_lens),
|
||||||
request_throughput=completed / dur_s,
|
request_throughput=completed / dur_s,
|
||||||
@ -734,6 +742,7 @@ async def benchmark(
|
|||||||
|
|
||||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||||
|
print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
|
||||||
if max_concurrency is not None:
|
if max_concurrency is not None:
|
||||||
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
|
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
|
||||||
if request_rate != float("inf"):
|
if request_rate != float("inf"):
|
||||||
@ -779,6 +788,7 @@ async def benchmark(
|
|||||||
result = {
|
result = {
|
||||||
"duration": benchmark_duration,
|
"duration": benchmark_duration,
|
||||||
"completed": metrics.completed,
|
"completed": metrics.completed,
|
||||||
|
"failed": metrics.failed,
|
||||||
"total_input_tokens": metrics.total_input,
|
"total_input_tokens": metrics.total_input,
|
||||||
"total_output_tokens": metrics.total_output,
|
"total_output_tokens": metrics.total_output,
|
||||||
"request_throughput": metrics.request_throughput,
|
"request_throughput": metrics.request_throughput,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user