From e0d6b4a867bd53edb33496756913c30480029ba8 Mon Sep 17 00:00:00 2001
From: Iceber Gu
Date: Fri, 7 Nov 2025 20:21:40 +0800
Subject: [PATCH] [CLI] add --max-tokens to `vllm complete` (#28109)

Signed-off-by: Iceber Gu
---
 vllm/entrypoints/cli/openai.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 99a8759c84f4..fb49be370203 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
     def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
         if args.quick:
-            stream = client.completions.create(
-                model=model_name, prompt=args.quick, stream=True
-            )
+            stream = client.completions.create(prompt=args.quick, **kwargs)
             _print_completion_stream(stream)
             return
 
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            stream = client.completions.create(
-                model=model_name, prompt=input_prompt, stream=True
-            )
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
             _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Add CLI arguments for the complete command."""
         _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
         parser.add_argument(
             "-q",
             "--quick",
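
Editor's note: below is a minimal standalone sketch of the pattern this patch adopts: build the request kwargs once, add `max_tokens` only when the flag was given, and forward the dict to every `client.completions.create` call. The argparse setup and the local endpoint are simplified stand-ins for vLLM's `FlexibleArgumentParser`, `_add_query_options`, and `_interactive_cli`, which live outside this diff; only the `openai` client calls mirror the patched code.

    import argparse

    from openai import OpenAI

    # Simplified stand-in for vLLM's FlexibleArgumentParser / _add_query_options.
    parser = argparse.ArgumentParser(prog="complete-sketch")
    parser.add_argument(
        "--max-tokens",
        type=int,
        help="Maximum number of tokens to generate per output sequence.",
    )
    parser.add_argument("-q", "--quick", type=str, help="Send one prompt and exit.")
    args = parser.parse_args()

    # Hypothetical local endpoint; in the real CLI, _interactive_cli resolves
    # the client and model name from the user's query options.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    model_name = client.models.list().data[0].id

    # The pattern from the patch: shared kwargs, optional keys added conditionally.
    kwargs = {"model": model_name, "stream": True}
    if args.max_tokens:
        kwargs["max_tokens"] = args.max_tokens

    stream = client.completions.create(prompt=args.quick or "Hello", **kwargs)
    for chunk in stream:
        print(chunk.choices[0].text or "", end="", flush=True)
    print()

With the patch applied, the new flag composes with quick mode, e.g. `vllm complete -q "Hello, my name is" --max-tokens 32`; omitting `--max-tokens` leaves the request exactly as before, since the key is never added to the kwargs dict. Note that the truthiness check also skips `--max-tokens 0`, which therefore behaves the same as not passing the flag at all.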