[CLI] add --max-tokens to vllm complete (#28109)

Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
This commit is contained in:
Iceber Gu 2025-11-07 20:21:40 +08:00 committed by GitHub
parent 72b1c2ae2c
commit e0d6b4a867
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
def cmd(args: argparse.Namespace) -> None:
model_name, client = _interactive_cli(args)
kwargs = {
"model": model_name,
"stream": True,
}
if args.max_tokens:
kwargs["max_tokens"] = args.max_tokens
if args.quick:
stream = client.completions.create(
model=model_name, prompt=args.quick, stream=True
)
stream = client.completions.create(prompt=args.quick, **kwargs)
_print_completion_stream(stream)
return
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
input_prompt = input("> ")
except EOFError:
break
stream = client.completions.create(
model=model_name, prompt=input_prompt, stream=True
)
stream = client.completions.create(prompt=input_prompt, **kwargs)
_print_completion_stream(stream)
@staticmethod
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"""Add CLI arguments for the complete command."""
_add_query_options(parser)
parser.add_argument(
"--max-tokens",
type=int,
help="Maximum number of tokens to generate per output sequence.",
)
parser.add_argument(
"-q",
"--quick",