diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 99a8759c84f4..fb49be370203 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
     def cmd(args: argparse.Namespace) -> None:
         model_name, client = _interactive_cli(args)
 
+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
         if args.quick:
-            stream = client.completions.create(
-                model=model_name, prompt=args.quick, stream=True
-            )
+            stream = client.completions.create(prompt=args.quick, **kwargs)
             _print_completion_stream(stream)
             return
 
@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                 input_prompt = input("> ")
             except EOFError:
                 break
-            stream = client.completions.create(
-                model=model_name, prompt=input_prompt, stream=True
-            )
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
             _print_completion_stream(stream)
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         """Add CLI arguments for the complete command."""
         _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
         parser.add_argument(
             "-q",
             "--quick",
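
For context, the request the patched command issues is equivalent to the following direct call with the official `openai` Python client. This is a minimal sketch, not part of the patch: the base URL, API key, and model name are placeholder assumptions for a locally served vLLM instance.

    from openai import OpenAI

    # Placeholder endpoint/credentials for a local OpenAI-compatible vLLM server.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    # With --max-tokens 16, the CLI adds "max_tokens" to kwargs before the call;
    # without the flag, the key is omitted and the server-side default applies.
    stream = client.completions.create(
        model="my-model",  # placeholder; the CLI fills this in from _interactive_cli
        prompt="The capital of France is",
        stream=True,
        max_tokens=16,
    )
    for chunk in stream:
        print(chunk.choices[0].text or "", end="", flush=True)
    print()

In CLI terms, `vllm complete -q "The capital of France is" --max-tokens 16` should cap the one-shot generation at 16 tokens, while omitting the flag preserves the previous behavior, since the `if args.max_tokens:` guard only adds the key when the argument is set.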