[CLI] add --max-tokens to vllm complete (#28109)
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
parent 72b1c2ae2c
commit e0d6b4a867
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
     def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
         if args.quick:
-            stream = client.completions.create(
-                model=model_name, prompt=args.quick, stream=True
-            )
+            stream = client.completions.create(prompt=args.quick, **kwargs)
             _print_completion_stream(stream)
             return
 
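The hunk above collects the shared request parameters into a single kwargs dict and only adds "max_tokens" when the flag was given, so omitting --max-tokens preserves the old request shape. For context, a minimal standalone sketch of that pattern, assuming the openai Python client pointed at a locally running `vllm serve` endpoint; the base URL, API key, model name, and prompt below are illustrative, not part of this commit:

from openai import OpenAI

# Assumed local OpenAI-compatible endpoint exposed by `vllm serve`.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Build the shared request parameters once, mirroring the diff above.
kwargs = {
    "model": "facebook/opt-125m",  # stands in for model_name
    "stream": True,
}
max_tokens = 32  # stands in for args.max_tokens
if max_tokens:
    kwargs["max_tokens"] = max_tokens

# Reuse the same kwargs for any prompt, as the new code does.
stream = client.completions.create(prompt="Hello, my name is", **kwargs)
for chunk in stream:
    print(chunk.choices[0].text or "", end="", flush=True)
print()
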
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            stream = client.completions.create(
-                model=model_name, prompt=input_prompt, stream=True
-            )
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
             _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Add CLI arguments for the complete command."""
         _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
         parser.add_argument(
             "-q",
             "--quick",
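With --max-tokens registered on the parser, a quick-mode invocation could look like the following (only flags visible in this diff are used; a model already being served is assumed):

vllm complete --quick "Hello, my name is" --max-tokens 32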