mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-19 14:07:00 +08:00
Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
This commit is contained in:
parent
a5464dcf92
commit
013abde6ef
@ -478,6 +478,7 @@ async def benchmark(
|
|||||||
request_rate: float,
|
request_rate: float,
|
||||||
burstiness: float,
|
burstiness: float,
|
||||||
disable_tqdm: bool,
|
disable_tqdm: bool,
|
||||||
|
num_warmups: int,
|
||||||
profile: bool,
|
profile: bool,
|
||||||
selected_percentile_metrics: list[str],
|
selected_percentile_metrics: list[str],
|
||||||
selected_percentiles: list[float],
|
selected_percentiles: list[float],
|
||||||
@ -559,10 +560,37 @@ async def benchmark(
|
|||||||
f"Error: {test_output.error}"
|
f"Error: {test_output.error}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print("Initial test run completed. Starting main benchmark run...")
|
print("Initial test run completed.")
|
||||||
else:
|
else:
|
||||||
print("Skipping endpoint ready check.")
|
print("Skipping endpoint ready check.")
|
||||||
|
|
||||||
|
if num_warmups > 0:
|
||||||
|
print(f"Warming up with {num_warmups} requests...")
|
||||||
|
warmup_pbar = None if disable_tqdm else tqdm(total=num_warmups)
|
||||||
|
warmup_semaphore = (
|
||||||
|
asyncio.Semaphore(max_concurrency)
|
||||||
|
if max_concurrency
|
||||||
|
else contextlib.nullcontext()
|
||||||
|
)
|
||||||
|
warmup_tasks = []
|
||||||
|
|
||||||
|
async def warmup_limited_request_func():
|
||||||
|
async with warmup_semaphore:
|
||||||
|
return await request_func(
|
||||||
|
request_func_input=test_input, session=session, pbar=warmup_pbar
|
||||||
|
)
|
||||||
|
|
||||||
|
for _ in range(num_warmups):
|
||||||
|
request_task = asyncio.create_task(warmup_limited_request_func())
|
||||||
|
warmup_tasks.append(request_task)
|
||||||
|
_ = await asyncio.gather(*warmup_tasks)
|
||||||
|
|
||||||
|
if warmup_pbar is not None:
|
||||||
|
warmup_pbar.close()
|
||||||
|
print("Warmup run completed.")
|
||||||
|
|
||||||
|
print("Starting main benchmark run...")
|
||||||
|
|
||||||
if lora_modules:
|
if lora_modules:
|
||||||
# For each input request, choose a LoRA module at random.
|
# For each input request, choose a LoRA module at random.
|
||||||
lora_modules = iter(
|
lora_modules = iter(
|
||||||
@ -1029,6 +1057,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Specify to disable tqdm progress bar.",
|
help="Specify to disable tqdm progress bar.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-warmups",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Number of warmup requests.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--profile",
|
"--profile",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
@ -1370,6 +1404,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
|
|||||||
request_rate=args.request_rate,
|
request_rate=args.request_rate,
|
||||||
burstiness=args.burstiness,
|
burstiness=args.burstiness,
|
||||||
disable_tqdm=args.disable_tqdm,
|
disable_tqdm=args.disable_tqdm,
|
||||||
|
num_warmups=args.num_warmups,
|
||||||
profile=args.profile,
|
profile=args.profile,
|
||||||
selected_percentile_metrics=percentile_metrics.split(","),
|
selected_percentile_metrics=percentile_metrics.split(","),
|
||||||
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
|
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user