diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 6a7db920b5b6..09c8e23ebb1c 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast) -from vllm.model_executor.model_loader.weight_utils import get_lock +# NOTE(simon): do not import vLLM here so the benchmark script +# can run without vLLM installed. AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download + from vllm.model_executor.model_loader.weight_utils import get_lock + # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(pretrained_model_name_or_path): diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1dd01ca96867..47627126b668 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -684,6 +684,15 @@ def main(args: argparse.Namespace): "Invalid metadata format. Please use KEY=VALUE format." ) + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", "output_lens", "ttfts", "itls", + "generated_texts", "errors" + ]: + if field in result_json: + del result_json[field] + # Traffic result_json["request_rate"] = (args.request_rate if args.request_rate < float("inf") else "inf") @@ -828,6 +837,12 @@ if __name__ == "__main__": action="store_true", help="Specify to save benchmark results to a json file", ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfts, tpots, etc.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE",