diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 5c6124db80b4f..cebdf56c45b1b 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -14,8 +14,8 @@ from tqdm import tqdm import vllm.envs as envs from vllm import LLM, SamplingParams -from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, - write_to_json) +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType from vllm.sampling_params import BeamSearchParams diff --git a/vllm/benchmarks/lib/__init__.py b/vllm/benchmarks/lib/__init__.py new file mode 100644 index 0000000000000..005e87af61949 --- /dev/null +++ b/vllm/benchmarks/lib/__init__.py @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Benchmark library utilities.""" diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py similarity index 100% rename from vllm/benchmarks/endpoint_request_func.py rename to vllm/benchmarks/lib/endpoint_request_func.py diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py new file mode 100644 index 0000000000000..a663f85b629d2 --- /dev/null +++ b/vllm/benchmarks/lib/ready_checker.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utilities for checking endpoint readiness.""" + +import asyncio +import time + +import aiohttp +from tqdm.asyncio import tqdm + +from .endpoint_request_func import RequestFuncInput, RequestFuncOutput + + +async def wait_for_endpoint( + request_func, + test_input: RequestFuncInput, + timeout_seconds: int = 600, + retry_interval: int = 5, +) -> RequestFuncOutput: + """ + Wait for an endpoint to become available before starting benchmarks. 
+
+    Args:
+        request_func: The async request function to call
+        test_input: The RequestFuncInput to test with
+        timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
+        retry_interval: Time between retries in seconds (default: 5 seconds)
+
+    Returns:
+        RequestFuncOutput: The first successful response, or the last
+            failed response (success=False) if the endpoint does not
+            become available within the timeout; this function does not
+            raise on timeout, so callers should check output.success.
+    """
+    deadline = time.perf_counter() + timeout_seconds
+    output = RequestFuncOutput(success=False)
+    print(f"Waiting for the endpoint to come up within {timeout_seconds}s")
+
+    with tqdm(
+        total=timeout_seconds,
+        bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
+        unit="s",
+    ) as pbar:
+
+        while True:
+            # update progress bar
+            remaining = deadline - time.perf_counter()
+            elapsed = timeout_seconds - remaining
+            update_amount = min(elapsed - pbar.n, timeout_seconds - pbar.n)
+            pbar.update(update_amount)
+            pbar.refresh()
+            if remaining <= 0:
+                pbar.close()
+                break
+
+            # ping the endpoint using request_func
+            try:
+                output = await request_func(request_func_input=test_input)
+                if output.success:
+                    pbar.close()
+                    return output
+            except aiohttp.ClientConnectorError:
+                pass
+
+            # retry after a delay
+            sleep_duration = min(retry_interval, remaining)
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+    return output
diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/lib/utils.py
similarity index 100%
rename from vllm/benchmarks/utils.py
rename to vllm/benchmarks/lib/utils.py
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index bd2b1e5990c83..45798547ac719 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -34,12 +34,12 @@
 from transformers import PreTrainedTokenizerBase
 from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
                                       get_samples)
-from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
-                                                   OPENAI_COMPATIBLE_BACKENDS,
-                                                   RequestFuncInput,
-                                                   RequestFuncOutput)
-from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
-                                   write_to_json)
+from vllm.benchmarks.lib.endpoint_request_func import (
+    ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+    RequestFuncOutput)
+from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
+from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
+                                       write_to_json)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -331,6 +331,7 @@ async def benchmark(
     ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
     ramp_up_start_rps: Optional[int] = None,
     ramp_up_end_rps: Optional[int] = None,
+    ready_check_timeout_sec: int = 600,
 ):
     if endpoint_type in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -359,7 +360,8 @@
         extra_body=extra_body,
     )
 
-    test_output = await request_func(request_func_input=test_input)
+    test_output = await wait_for_endpoint(
+        request_func, test_input, timeout_seconds=ready_check_timeout_sec)
     if not test_output.success:
         raise ValueError(
             "Initial test run failed - Please make sure benchmark arguments "
@@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The ending request rate for ramp-up (RPS). "
" "Needs to be specified when --ramp-up-strategy is used.", ) + parser.add_argument( + "--ready-check-timeout-sec", + type=int, + default=600, + help="Maximum time to wait for the endpoint to become ready " + "in seconds (default: 600 seconds / 10 minutes).", + ) def main(args: argparse.Namespace): @@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace): ramp_up_strategy=args.ramp_up_strategy, ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, + ready_check_timeout_sec=args.ready_check_timeout_sec, )) # Save config and results to json diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 0fe042e2736da..bbd18ca3ae22e 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -21,8 +21,8 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset, InstructCoderDataset, RandomDataset, SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset) -from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, - write_to_json) +from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format, + write_to_json) from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args)