From 10904e6d755051260a7c3ce98659d8907c74caa9 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Tue, 22 Jul 2025 05:28:00 -0700 Subject: [PATCH] [benchmark] Port benchmark request sent optimization to benchmark_serving (#21209) Signed-off-by: Jialin Ouyang --- benchmarks/benchmark_serving.py | 98 +-------------------------------- vllm/benchmarks/serve.py | 10 ++-- 2 files changed, 7 insertions(+), 101 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index f3a20842137e..c597fb1068ab 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -30,7 +30,7 @@ import os import random import time import warnings -from collections.abc import AsyncGenerator, Iterable +from collections.abc import Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Literal, Optional @@ -73,6 +73,7 @@ from benchmark_dataset import ( VisionArenaDataset, ) from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from vllm.benchmarks.serve import get_request MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -107,101 +108,6 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] -def _get_current_request_rate( - ramp_up_strategy: Optional[Literal["linear", "exponential"]], - ramp_up_start_rps: Optional[int], - ramp_up_end_rps: Optional[int], - request_index: int, - total_requests: int, - request_rate: float, -) -> float: - if ( - ramp_up_strategy - and ramp_up_start_rps is not None - and ramp_up_end_rps is not None - ): - progress = request_index / max(total_requests - 1, 1) - if ramp_up_strategy == "linear": - increase = (ramp_up_end_rps - ramp_up_start_rps) * progress - return ramp_up_start_rps + increase - elif ramp_up_strategy == "exponential": - ratio = ramp_up_end_rps / ramp_up_start_rps - return ramp_up_start_rps * (ratio**progress) - else: - raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}") - return request_rate - - -async def get_request( - input_requests: list[SampleRequest], - request_rate: float, - burstiness: float = 1.0, - ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None, - ramp_up_start_rps: Optional[int] = None, - ramp_up_end_rps: Optional[int] = None, -) -> AsyncGenerator[tuple[SampleRequest, float], None]: - """ - Asynchronously generates requests at a specified rate - with OPTIONAL burstiness and OPTIONAL ramp-up strategy. - - Args: - input_requests: - A list of input requests, each represented as a SampleRequest. - request_rate: - The rate at which requests are generated (requests/s). - burstiness (optional): - The burstiness factor of the request generation. - Only takes effect when request_rate is not inf. - Default value is 1, which follows a Poisson process. - Otherwise, the request intervals follow a gamma distribution. - A lower burstiness value (0 < burstiness < 1) results - in more bursty requests, while a higher burstiness value - (burstiness > 1) results in a more uniform arrival of requests. - ramp_up_strategy (optional): - The ramp-up strategy. Can be "linear" or "exponential". - If None, uses constant request rate (specified by request_rate). - ramp_up_start_rps (optional): - The starting request rate for ramp-up. - ramp_up_end_rps (optional): - The ending request rate for ramp-up. - """ - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}." - ) - # Convert to list to get length for ramp-up calculations - if isinstance(input_requests, Iterable) and not isinstance(input_requests, list): - input_requests = list(input_requests) - - total_requests = len(input_requests) - request_index = 0 - - for request in input_requests: - current_request_rate = _get_current_request_rate( - ramp_up_strategy, - ramp_up_start_rps, - ramp_up_end_rps, - request_index, - total_requests, - request_rate, - ) - - yield request, current_request_rate - - request_index += 1 - - if current_request_rate == float("inf"): - # If the request rate is infinity, then we don't need to wait. - continue - - theta = 1.0 / (current_request_rate * burstiness) - - # Sample the request interval from the gamma distribution. - # If burstiness is 1, it follows exponential distribution. - interval = np.random.gamma(shape=burstiness, scale=theta) - # The next request will be sent after the interval. - await asyncio.sleep(interval) - - def calculate_metrics( input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index a4d51936320b..f4506c9ce6f4 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -179,12 +179,12 @@ async def get_request( delay_ts = [delay * normalize_factor for delay in delay_ts] start_ts = time.time() - request_index = 0 for request_index, request in enumerate(input_requests): - current_ts = time.time() - sleep_interval_s = start_ts + delay_ts[request_index] - current_ts - if sleep_interval_s > 0: - await asyncio.sleep(sleep_interval_s) + if delay_ts[request_index] > 0: + current_ts = time.time() + sleep_interval_s = start_ts + delay_ts[request_index] - current_ts + if sleep_interval_s > 0: + await asyncio.sleep(sleep_interval_s) yield request, request_rates[request_index]