From 013abde6ef02e55abb393d3baff855eea9693479 Mon Sep 17 00:00:00 2001 From: kimbochen <40498137+kimbochen@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:44:32 -0400 Subject: [PATCH] Adding Warmup to Benchmark Serving (#26943) Signed-off-by: Kimbo Chen --- vllm/benchmarks/serve.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 79e39fb86b326..c5a52133a471d 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -478,6 +478,7 @@ async def benchmark( request_rate: float, burstiness: float, disable_tqdm: bool, + num_warmups: int, profile: bool, selected_percentile_metrics: list[str], selected_percentiles: list[float], @@ -559,10 +560,37 @@ async def benchmark( f"Error: {test_output.error}" ) else: - print("Initial test run completed. Starting main benchmark run...") + print("Initial test run completed.") else: print("Skipping endpoint ready check.") + if num_warmups > 0: + print(f"Warming up with {num_warmups} requests...") + warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups) + warmup_semaphore = ( + asyncio.Semaphore(max_concurrency) + if max_concurrency + else contextlib.nullcontext() + ) + warmup_tasks = [] + + async def warmup_limited_request_func(): + async with warmup_semaphore: + return await request_func( + request_func_input=test_input, session=session, pbar=warmup_pbar + ) + + for _ in range(num_warmups): + request_task = asyncio.create_task(warmup_limited_request_func()) + warmup_tasks.append(request_task) + _ = await asyncio.gather(*warmup_tasks) + + if warmup_pbar is not None: + warmup_pbar.close() + print("Warmup run completed.") + + print("Starting main benchmark run...") + if lora_modules: # For each input request, choose a LoRA module at random. lora_modules = iter( @@ -1029,6 +1057,12 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Specify to disable tqdm progress bar.", ) + parser.add_argument( + "--num-warmups", + type=int, + default=0, + help="Number of warmup requests.", + ) parser.add_argument( "--profile", action="store_true", @@ -1370,6 +1404,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, + num_warmups=args.num_warmups, profile=args.profile, selected_percentile_metrics=percentile_metrics.split(","), selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],