[Misc]Add param max-model-len in benchmark_latency.py (#5629)

2026-01-26 13:04:41 +08:00 · 2024-06-19 18:19:08 +08:00 · 2024-06-19 18:19:08 +08:00 · d8714530d1
commit d8714530d1
parent 7d46c8d378
1 changed file with 7 additions and 0 deletions
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -29,6 +29,7 @@ def main(args: argparse.Namespace):
        tensor_parallel_size=args.tensor_parallel_size,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
+        max_model_len=args.max_model_len,
        enforce_eager=args.enforce_eager,
        kv_cache_dtype=args.kv_cache_dtype,
        quantization_param_path=args.quantization_param_path,
@@ -150,6 +151,12 @@ if __name__ == '__main__':
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,