mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-28 06:07:04 +08:00
[Benchmark] Support ready check timeout in vllm bench serve (#21696)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com> Co-authored-by: Roger Wang <hey@rogerw.me>
This commit is contained in:
parent
3dddbf1f25
commit
3f36c325fa
@ -14,8 +14,8 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
|
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
|
||||||
write_to_json)
|
write_to_json)
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.inputs import PromptType
|
from vllm.inputs import PromptType
|
||||||
from vllm.sampling_params import BeamSearchParams
|
from vllm.sampling_params import BeamSearchParams
|
||||||
|
|||||||
3
vllm/benchmarks/lib/__init__.py
Normal file
3
vllm/benchmarks/lib/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""Benchmark library utilities."""
|
||||||
70
vllm/benchmarks/lib/ready_checker.py
Normal file
70
vllm/benchmarks/lib/ready_checker.py
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""Utilities for checking endpoint readiness."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
|
||||||
|
from .endpoint_request_func import RequestFuncInput, RequestFuncOutput
|
||||||
|
|
||||||
|
|
||||||
|
async def wait_for_endpoint(
    request_func,
    test_input: RequestFuncInput,
    timeout_seconds: int = 600,
    retry_interval: int = 5,
) -> RequestFuncOutput:
    """
    Wait for an endpoint to become available before starting benchmarks.

    The endpoint is probed repeatedly with ``request_func`` until a probe
    reports success or the deadline passes. A progress bar tracks how much
    of the timeout budget has been consumed.

    Args:
        request_func: The async request function to call. It is invoked as
            ``request_func(request_func_input=test_input)``.
        test_input: The RequestFuncInput to test with
        timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
        retry_interval: Time between retries in seconds (default: 5 seconds)

    Returns:
        RequestFuncOutput: The first successful response. If no probe
        succeeds before the deadline, the most recent unsuccessful output is
        returned instead (or a fresh ``RequestFuncOutput(success=False)``
        when no probe produced an output at all, e.g. every attempt failed
        to connect or ``timeout_seconds`` is not positive).

    Note:
        This function does not raise on timeout; callers must inspect
        ``success`` on the returned output. Only
        ``aiohttp.ClientConnectorError`` is treated as "not up yet"; any
        other exception raised by ``request_func`` propagates.
    """
    deadline = time.perf_counter() + timeout_seconds
    output = RequestFuncOutput(success=False)
    print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")

    # The context manager closes the bar on every exit path (return, break,
    # or exception), so no explicit pbar.close() calls are needed.
    with tqdm(
        total=timeout_seconds,
        bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
        unit="s",
    ) as pbar:
        while True:
            # Advance the bar to the elapsed wall time, clamped so it never
            # runs past the configured total.
            remaining = deadline - time.perf_counter()
            elapsed = timeout_seconds - remaining
            pbar.update(min(elapsed, timeout_seconds) - pbar.n)
            pbar.refresh()
            if remaining <= 0:
                break

            # Ping the endpoint using request_func.
            try:
                output = await request_func(request_func_input=test_input)
            except aiohttp.ClientConnectorError:
                # Server is not accepting connections yet; retry below.
                pass
            else:
                if output.success:
                    return output

            # The probe itself may have consumed a large share of the
            # budget, so re-measure before sleeping; otherwise the sleep
            # could run past the deadline.
            remaining = deadline - time.perf_counter()
            sleep_duration = min(retry_interval, remaining)
            if sleep_duration > 0:
                await asyncio.sleep(sleep_duration)

    return output
|
||||||
@ -34,12 +34,12 @@ from transformers import PreTrainedTokenizerBase
|
|||||||
|
|
||||||
from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
|
from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
|
||||||
get_samples)
|
get_samples)
|
||||||
from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS,
|
from vllm.benchmarks.lib.endpoint_request_func import (
|
||||||
OPENAI_COMPATIBLE_BACKENDS,
|
ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
|
||||||
RequestFuncInput,
|
RequestFuncOutput)
|
||||||
RequestFuncOutput)
|
from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
|
||||||
from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
|
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
|
||||||
write_to_json)
|
write_to_json)
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||||
@ -331,6 +331,7 @@ async def benchmark(
|
|||||||
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
|
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
|
||||||
ramp_up_start_rps: Optional[int] = None,
|
ramp_up_start_rps: Optional[int] = None,
|
||||||
ramp_up_end_rps: Optional[int] = None,
|
ramp_up_end_rps: Optional[int] = None,
|
||||||
|
ready_check_timeout_sec: int = 600,
|
||||||
):
|
):
|
||||||
if endpoint_type in ASYNC_REQUEST_FUNCS:
|
if endpoint_type in ASYNC_REQUEST_FUNCS:
|
||||||
request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
|
request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
|
||||||
@ -359,7 +360,8 @@ async def benchmark(
|
|||||||
extra_body=extra_body,
|
extra_body=extra_body,
|
||||||
)
|
)
|
||||||
|
|
||||||
test_output = await request_func(request_func_input=test_input)
|
test_output = await wait_for_endpoint(
|
||||||
|
request_func, test_input, timeout_seconds=ready_check_timeout_sec)
|
||||||
if not test_output.success:
|
if not test_output.success:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Initial test run failed - Please make sure benchmark arguments "
|
"Initial test run failed - Please make sure benchmark arguments "
|
||||||
@ -907,6 +909,13 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
|||||||
help="The ending request rate for ramp-up (RPS). "
|
help="The ending request rate for ramp-up (RPS). "
|
||||||
"Needs to be specified when --ramp-up-strategy is used.",
|
"Needs to be specified when --ramp-up-strategy is used.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ready-check-timeout-sec",
|
||||||
|
type=int,
|
||||||
|
default=600,
|
||||||
|
help="Maximum time to wait for the endpoint to become ready "
|
||||||
|
"in seconds (default: 600 seconds / 10 minutes).",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
@ -1012,6 +1021,7 @@ def main(args: argparse.Namespace):
|
|||||||
ramp_up_strategy=args.ramp_up_strategy,
|
ramp_up_strategy=args.ramp_up_strategy,
|
||||||
ramp_up_start_rps=args.ramp_up_start_rps,
|
ramp_up_start_rps=args.ramp_up_start_rps,
|
||||||
ramp_up_end_rps=args.ramp_up_end_rps,
|
ramp_up_end_rps=args.ramp_up_end_rps,
|
||||||
|
ready_check_timeout_sec=args.ready_check_timeout_sec,
|
||||||
))
|
))
|
||||||
|
|
||||||
# Save config and results to json
|
# Save config and results to json
|
||||||
|
|||||||
@ -21,8 +21,8 @@ from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
|
|||||||
InstructCoderDataset, RandomDataset,
|
InstructCoderDataset, RandomDataset,
|
||||||
SampleRequest, ShareGPTDataset,
|
SampleRequest, ShareGPTDataset,
|
||||||
SonnetDataset, VisionArenaDataset)
|
SonnetDataset, VisionArenaDataset)
|
||||||
from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format,
|
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
|
||||||
write_to_json)
|
write_to_json)
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
||||||
from vllm.entrypoints.openai.api_server import (
|
from vllm.entrypoints.openai.api_server import (
|
||||||
build_async_engine_client_from_engine_args)
|
build_async_engine_client_from_engine_args)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user