From e0d6b4a867bd53edb33496756913c30480029ba8 Mon Sep 17 00:00:00 2001
From: Iceber Gu
Date: Fri, 7 Nov 2025 20:21:40 +0800
Subject: [PATCH] [CLI] add --max-tokens to `vllm complete` (#28109)

Signed-off-by: Iceber Gu
---
 vllm/entrypoints/cli/openai.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 99a8759c84f4..fb49be370203 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
     def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
         if args.quick:
-            stream = client.completions.create(
-                model=model_name, prompt=args.quick, stream=True
-            )
+            stream = client.completions.create(prompt=args.quick, **kwargs)
             _print_completion_stream(stream)
             return
 
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            stream = client.completions.create(
-                model=model_name, prompt=input_prompt, stream=True
-            )
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
             _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Add CLI arguments for the complete command."""
         _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
         parser.add_argument(
             "-q",
             "--quick",
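
Editor's note: below is a minimal standalone sketch of the pattern this patch adopts: build the request kwargs once, add `max_tokens` only when the flag was given, and forward the dict to every `client.completions.create` call. The argparse setup and the local endpoint are simplified stand-ins for vLLM's `FlexibleArgumentParser`, `_add_query_options`, and `_interactive_cli`, which live outside this diff; only the `openai` client calls mirror the patched code.

    import argparse

    from openai import OpenAI

    # Simplified stand-in for vLLM's FlexibleArgumentParser / _add_query_options.
    parser = argparse.ArgumentParser(prog="complete-sketch")
    parser.add_argument(
        "--max-tokens",
        type=int,
        help="Maximum number of tokens to generate per output sequence.",
    )
    parser.add_argument("-q", "--quick", type=str, help="Send one prompt and exit.")
    args = parser.parse_args()

    # Hypothetical local endpoint; in the real CLI, _interactive_cli resolves
    # the client and model name from the user's query options.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    model_name = client.models.list().data[0].id

    # The pattern from the patch: shared kwargs, optional keys added conditionally.
    kwargs = {"model": model_name, "stream": True}
    if args.max_tokens:
        kwargs["max_tokens"] = args.max_tokens

    stream = client.completions.create(prompt=args.quick or "Hello", **kwargs)
    for chunk in stream:
        print(chunk.choices[0].text or "", end="", flush=True)
    print()

With the patch applied, the new flag composes with quick mode, e.g. `vllm complete -q "Hello, my name is" --max-tokens 32`; omitting `--max-tokens` leaves the request exactly as before, since the key is never added to the kwargs dict. Note that the truthiness check also skips `--max-tokens 0`, which therefore behaves the same as not passing the flag at all.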