mirror of https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 08:34:56 +08:00

[benchmark] Port benchmark request sent optimization to benchmark_serving (#21209)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>

parent a32237665d
commit 10904e6d75
@@ -30,7 +30,7 @@ import os
 import random
 import time
 import warnings
-from collections.abc import AsyncGenerator, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Literal, Optional
@@ -73,6 +73,7 @@ from benchmark_dataset import (
     VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm.benchmarks.serve import get_request

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

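Note on the import added above: benchmark_serving.py now reuses the shared request generator from vllm.benchmarks.serve instead of carrying its own copy (which the next hunk deletes). A hedged sketch of how a benchmark driver consumes it, modeled on the deleted local signature; the shared version's exact parameters may differ, and `send_all` is our name, not the repo's:

from vllm.benchmarks.serve import get_request

async def send_all(input_requests, request_rate, burstiness=1.0):
    # get_request is an async generator yielding (request, current_rate) pairs.
    async for request, rate in get_request(input_requests, request_rate, burstiness):
        ...  # dispatch `request`; `rate` is the instantaneous target RPS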
@@ -107,101 +108,6 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]


-def _get_current_request_rate(
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-    ramp_up_start_rps: Optional[int],
-    ramp_up_end_rps: Optional[int],
-    request_index: int,
-    total_requests: int,
-    request_rate: float,
-) -> float:
-    if (
-        ramp_up_strategy
-        and ramp_up_start_rps is not None
-        and ramp_up_end_rps is not None
-    ):
-        progress = request_index / max(total_requests - 1, 1)
-        if ramp_up_strategy == "linear":
-            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
-            return ramp_up_start_rps + increase
-        elif ramp_up_strategy == "exponential":
-            ratio = ramp_up_end_rps / ramp_up_start_rps
-            return ramp_up_start_rps * (ratio**progress)
-        else:
-            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
-    return request_rate
-
-
-async def get_request(
-    input_requests: list[SampleRequest],
-    request_rate: float,
-    burstiness: float = 1.0,
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
-) -> AsyncGenerator[tuple[SampleRequest, float], None]:
-    """
-    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
-
-    Args:
-        input_requests:
-            A list of input requests, each represented as a SampleRequest.
-        request_rate:
-            The rate at which requests are generated (requests/s).
-        burstiness (optional):
-            The burstiness factor of the request generation.
-            Only takes effect when request_rate is not inf.
-            Default value is 1, which follows a Poisson process.
-            Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results
-            in more bursty requests, while a higher burstiness value
-            (burstiness > 1) results in a more uniform arrival of requests.
-        ramp_up_strategy (optional):
-            The ramp-up strategy. Can be "linear" or "exponential".
-            If None, uses constant request rate (specified by request_rate).
-        ramp_up_start_rps (optional):
-            The starting request rate for ramp-up.
-        ramp_up_end_rps (optional):
-            The ending request rate for ramp-up.
-    """
-    assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}."
-    )
-    # Convert to list to get length for ramp-up calculations
-    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
-        input_requests = list(input_requests)
-
-    total_requests = len(input_requests)
-    request_index = 0
-
-    for request in input_requests:
-        current_request_rate = _get_current_request_rate(
-            ramp_up_strategy,
-            ramp_up_start_rps,
-            ramp_up_end_rps,
-            request_index,
-            total_requests,
-            request_rate,
-        )
-
-        yield request, current_request_rate
-
-        request_index += 1
-
-        if current_request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
-
-        theta = 1.0 / (current_request_rate * burstiness)
-
-        # Sample the request interval from the gamma distribution.
-        # If burstiness is 1, it follows exponential distribution.
-        interval = np.random.gamma(shape=burstiness, scale=theta)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
-
-
 def calculate_metrics(
     input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
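For context on the pacing math that just moved out of this file: the deleted helper interpolates the instantaneous rate between ramp_up_start_rps and ramp_up_end_rps (linearly, or geometrically for "exponential"), and the deleted generator then draws each inter-arrival gap from Gamma(shape=burstiness, scale=1/(rate * burstiness)), whose mean is 1/rate for any burstiness; burstiness = 1 reduces to an exponential gap, i.e. Poisson arrivals. A minimal standalone sketch of the same math (function names here are illustrative, not from the repo):

import numpy as np

def ramped_rate(start_rps: float, end_rps: float, progress: float,
                strategy: str = "linear") -> float:
    # progress runs from 0 at the first request to 1 at the last.
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * progress
    return start_rps * (end_rps / start_rps) ** progress  # "exponential"

def gap_seconds(rate_rps: float, burstiness: float = 1.0) -> float:
    # Gamma(shape=b, scale=1/(r*b)) has mean 1/r; b=1 is the exponential case.
    return float(np.random.gamma(shape=burstiness, scale=1.0 / (rate_rps * burstiness)))

# Ramp 1 -> 10 RPS over 100 requests, with burstier-than-Poisson arrivals:
gaps = [gap_seconds(ramped_rate(1, 10, i / 99), burstiness=0.5) for i in range(100)]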
@@ -179,12 +179,12 @@ async def get_request
     delay_ts = [delay * normalize_factor for delay in delay_ts]

     start_ts = time.time()
-    request_index = 0
     for request_index, request in enumerate(input_requests):
-        current_ts = time.time()
-        sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
-        if sleep_interval_s > 0:
-            await asyncio.sleep(sleep_interval_s)
+        if delay_ts[request_index] > 0:
+            current_ts = time.time()
+            sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
+            if sleep_interval_s > 0:
+                await asyncio.sleep(sleep_interval_s)
         yield request, request_rates[request_index]

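This hunk is the optimization itself, applied in the shared generator that benchmark_serving.py now imports: the delays are precomputed once (delay_ts) and each request sleeps toward an absolute deadline start_ts + delay_ts[i], so per-iteration overhead cannot drift the schedule, and the new delay_ts[request_index] > 0 guard skips the clock read and sleep entirely when all delays are zero (the infinite-rate case), leaving a bare yield loop. A toy sketch of the same pattern under those assumptions (names are illustrative, not the repo's):

import asyncio
import time

async def paced(items, delays_s):
    # Sleep toward absolute deadlines start + delays_s[i]; zero delay means no sleep.
    start = time.time()
    for i, item in enumerate(items):
        if delays_s[i] > 0:  # infinite-rate case: all delays 0, no clock reads
            remaining = start + delays_s[i] - time.time()
            if remaining > 0:
                await asyncio.sleep(remaining)
        yield item

async def main():
    async for x in paced(["a", "b", "c"], [0.0, 0.1, 0.2]):
        print(f"+{time.time():.2f}s {x}")

asyncio.run(main())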