mirror of https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 08:34:56 +08:00

[benchmark] Port benchmark request sent optimization to benchmark_serving (#21209)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>

parent a32237665d
commit 10904e6d75
@@ -30,7 +30,7 @@ import os
 import random
 import time
 import warnings
-from collections.abc import AsyncGenerator, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Literal, Optional
@@ -73,6 +73,7 @@ from benchmark_dataset import (
     VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from vllm.benchmarks.serve import get_request

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

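Note on the import added above: benchmark_serving.py now reuses the shared request generator from vllm.benchmarks.serve instead of carrying its own copy (which the next hunk deletes). A hedged sketch of how a benchmark driver consumes it, modeled on the deleted local signature; the shared version's exact parameters may differ, and `send_all` is our name, not the repo's:

from vllm.benchmarks.serve import get_request

async def send_all(input_requests, request_rate, burstiness=1.0):
    # get_request is an async generator yielding (request, current_rate) pairs.
    async for request, rate in get_request(input_requests, request_rate, burstiness):
        ...  # dispatch `request`; `rate` is the instantaneous target RPS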
@@ -107,101 +108,6 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]


-def _get_current_request_rate(
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-    ramp_up_start_rps: Optional[int],
-    ramp_up_end_rps: Optional[int],
-    request_index: int,
-    total_requests: int,
-    request_rate: float,
-) -> float:
-    if (
-        ramp_up_strategy
-        and ramp_up_start_rps is not None
-        and ramp_up_end_rps is not None
-    ):
-        progress = request_index / max(total_requests - 1, 1)
-        if ramp_up_strategy == "linear":
-            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
-            return ramp_up_start_rps + increase
-        elif ramp_up_strategy == "exponential":
-            ratio = ramp_up_end_rps / ramp_up_start_rps
-            return ramp_up_start_rps * (ratio**progress)
-        else:
-            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
-    return request_rate
-
-
-async def get_request(
-    input_requests: list[SampleRequest],
-    request_rate: float,
-    burstiness: float = 1.0,
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
-) -> AsyncGenerator[tuple[SampleRequest, float], None]:
-    """
-    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
-
-    Args:
-        input_requests:
-            A list of input requests, each represented as a SampleRequest.
-        request_rate:
-            The rate at which requests are generated (requests/s).
-        burstiness (optional):
-            The burstiness factor of the request generation.
-            Only takes effect when request_rate is not inf.
-            Default value is 1, which follows a Poisson process.
-            Otherwise, the request intervals follow a gamma distribution.
-            A lower burstiness value (0 < burstiness < 1) results
-            in more bursty requests, while a higher burstiness value
-            (burstiness > 1) results in a more uniform arrival of requests.
-        ramp_up_strategy (optional):
-            The ramp-up strategy. Can be "linear" or "exponential".
-            If None, uses constant request rate (specified by request_rate).
-        ramp_up_start_rps (optional):
-            The starting request rate for ramp-up.
-        ramp_up_end_rps (optional):
-            The ending request rate for ramp-up.
-    """
-    assert burstiness > 0, (
-        f"A positive burstiness factor is expected, but given {burstiness}."
-    )
-    # Convert to list to get length for ramp-up calculations
-    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
-        input_requests = list(input_requests)
-
-    total_requests = len(input_requests)
-    request_index = 0
-
-    for request in input_requests:
-        current_request_rate = _get_current_request_rate(
-            ramp_up_strategy,
-            ramp_up_start_rps,
-            ramp_up_end_rps,
-            request_index,
-            total_requests,
-            request_rate,
-        )
-
-        yield request, current_request_rate
-
-        request_index += 1
-
-        if current_request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
-
-        theta = 1.0 / (current_request_rate * burstiness)
-
-        # Sample the request interval from the gamma distribution.
-        # If burstiness is 1, it follows exponential distribution.
-        interval = np.random.gamma(shape=burstiness, scale=theta)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
-
-
 def calculate_metrics(
     input_requests: list[SampleRequest],
     outputs: list[RequestFuncOutput],
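For context on the pacing math that just moved out of this file: the deleted helper interpolates the instantaneous rate between ramp_up_start_rps and ramp_up_end_rps (linearly, or geometrically for "exponential"), and the deleted generator then draws each inter-arrival gap from Gamma(shape=burstiness, scale=1/(rate * burstiness)), whose mean is 1/rate for any burstiness; burstiness = 1 reduces to an exponential gap, i.e. Poisson arrivals. A minimal standalone sketch of the same math (function names here are illustrative, not from the repo):

import numpy as np

def ramped_rate(start_rps: float, end_rps: float, progress: float,
                strategy: str = "linear") -> float:
    # progress runs from 0 at the first request to 1 at the last.
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * progress
    return start_rps * (end_rps / start_rps) ** progress  # "exponential"

def gap_seconds(rate_rps: float, burstiness: float = 1.0) -> float:
    # Gamma(shape=b, scale=1/(r*b)) has mean 1/r; b=1 is the exponential case.
    return float(np.random.gamma(shape=burstiness, scale=1.0 / (rate_rps * burstiness)))

# Ramp 1 -> 10 RPS over 100 requests, with burstier-than-Poisson arrivals:
gaps = [gap_seconds(ramped_rate(1, 10, i / 99), burstiness=0.5) for i in range(100)]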
@@ -179,12 +179,12 @@ async def get_request
     delay_ts = [delay * normalize_factor for delay in delay_ts]

     start_ts = time.time()
-    request_index = 0
     for request_index, request in enumerate(input_requests):
-        current_ts = time.time()
-        sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
-        if sleep_interval_s > 0:
-            await asyncio.sleep(sleep_interval_s)
+        if delay_ts[request_index] > 0:
+            current_ts = time.time()
+            sleep_interval_s = start_ts + delay_ts[request_index] - current_ts
+            if sleep_interval_s > 0:
+                await asyncio.sleep(sleep_interval_s)
         yield request, request_rates[request_index]

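This hunk is the optimization itself, applied in the shared generator that benchmark_serving.py now imports: the delays are precomputed once (delay_ts) and each request sleeps toward an absolute deadline start_ts + delay_ts[i], so per-iteration overhead cannot drift the schedule, and the new delay_ts[request_index] > 0 guard skips the clock read and sleep entirely when all delays are zero (the infinite-rate case), leaving a bare yield loop. A toy sketch of the same pattern under those assumptions (names are illustrative, not the repo's):

import asyncio
import time

async def paced(items, delays_s):
    # Sleep toward absolute deadlines start + delays_s[i]; zero delay means no sleep.
    start = time.time()
    for i, item in enumerate(items):
        if delays_s[i] > 0:  # infinite-rate case: all delays 0, no clock reads
            remaining = start + delays_s[i] - time.time()
            if remaining > 0:
                await asyncio.sleep(remaining)
        yield item

async def main():
    async for x in paced(["a", "b", "c"], [0.0, 0.1, 0.2]):
        print(f"+{time.time():.2f}s {x}")

asyncio.run(main())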