[CLI] add --max-tokens to vllm complete (#28109)

Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
2025-12-10 09:35:50 +08:00 · 2025-11-07 20:21:40 +08:00 · 2025-11-07 20:21:40 +08:00 · e0d6b4a867
commit e0d6b4a867
parent 72b1c2ae2c
1 changed file with 14 additions and 6 deletions
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
    def cmd(args: argparse.Namespace) -> None:
        model_name, client = _interactive_cli(args)
        kwargs = {
            "model": model_name,
            "stream": True,
        }
        if args.max_tokens:
            kwargs["max_tokens"] = args.max_tokens
        if args.quick:
-            stream = client.completions.create(
+            stream = client.completions.create(prompt=args.quick, **kwargs)
                model=model_name, prompt=args.quick, stream=True
            )
            _print_completion_stream(stream)
            return
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                input_prompt = input("> ")
            except EOFError:
                break
-            stream = client.completions.create(
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
                model=model_name, prompt=input_prompt, stream=True
            )
            _print_completion_stream(stream)
    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Add CLI arguments for the complete command."""
        _add_query_options(parser)
        parser.add_argument(
            "--max-tokens",
            type=int,
            help="Maximum number of tokens to generate per output sequence.",
        )
        parser.add_argument(
            "-q",
            "--quick",