Adding Warmup to Benchmark Serving (#26943)

Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
This commit is contained in:
kimbochen 2025-10-16 15:44:32 -04:00 committed by GitHub
parent a5464dcf92
commit 013abde6ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -478,6 +478,7 @@ async def benchmark(
request_rate: float,
burstiness: float,
disable_tqdm: bool,
num_warmups: int,
profile: bool,
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
@ -559,10 +560,37 @@ async def benchmark(
f"Error: {test_output.error}"
)
else:
print("Initial test run completed. Starting main benchmark run...")
print("Initial test run completed.")
else:
print("Skipping endpoint ready check.")
# Optional warmup phase: when num_warmups is positive, issue that many
# requests before the main benchmark run. Every warmup request reuses the
# same test_input.
if num_warmups > 0:
print(f"Warming up with {num_warmups} requests...")
# No progress bar is created when tqdm output is disabled.
warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups)
# Bound the number of in-flight warmup requests by max_concurrency when it
# is set (truthy); otherwise fall back to a no-op context manager so the
# `async with` below imposes no limit.
# NOTE(review): contextlib.nullcontext() only supports `async with` on
# Python 3.10+ -- confirm the minimum supported interpreter version.
warmup_semaphore = (
asyncio.Semaphore(max_concurrency)
if max_concurrency
else contextlib.nullcontext()
)
warmup_tasks = []
# Sends a single warmup request while holding the concurrency guard and
# returns whatever request_func returns.
async def warmup_limited_request_func():
async with warmup_semaphore:
return await request_func(
request_func_input=test_input, session=session, pbar=warmup_pbar
)
# Schedule all warmup requests up front as tasks; the guard above is what
# throttles how many actually run at once.
for _ in range(num_warmups):
request_task = asyncio.create_task(warmup_limited_request_func())
warmup_tasks.append(request_task)
# Wait for every warmup request to finish; the results are discarded.
_ = await asyncio.gather(*warmup_tasks)
if warmup_pbar is not None:
warmup_pbar.close()
print("Warmup run completed.")
# NOTE(review): indentation is not visible in this view; presumably this
# message is printed whether or not a warmup ran -- confirm.
print("Starting main benchmark run...")
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
@ -1029,6 +1057,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
action="store_true",
help="Specify to disable tqdm progress bar.",
)
# --num-warmups: integer count of warmup requests; defaults to 0.
parser.add_argument(
"--num-warmups",
type=int,
default=0,
help="Number of warmup requests.",
)
parser.add_argument(
"--profile",
action="store_true",
@ -1370,6 +1404,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
request_rate=args.request_rate,
burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
num_warmups=args.num_warmups,
profile=args.profile,
selected_percentile_metrics=percentile_metrics.split(","),
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],