Add full serve CLI reference back to docs (#20978)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-07-15 18:42:30 +01:00 committed by GitHub
parent 1e36c8687e
commit b637e9dcb8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 58 additions and 35 deletions

View File

@ -1,3 +1,7 @@
---
toc_depth: 4
---
# vLLM CLI Guide # vLLM CLI Guide
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -42,6 +46,10 @@ Start the vLLM OpenAI Compatible API server.
vllm serve --help=page vllm serve --help=page
``` ```
### Options
--8<-- "docs/argparse/serve.md"
## chat ## chat
Generate chat completions via the running API server. Generate chat completions via the running API server.

View File

@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
## CLI Arguments ## CLI Arguments
The `vllm serve` command is used to launch the OpenAI-compatible server. The `vllm serve` command is used to launch the OpenAI-compatible server.
To see the available CLI arguments, run `vllm serve --help`! To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
## Configuration file ## Configuration file

View File

@ -16,6 +16,7 @@ sys.modules["blake3"] = MagicMock()
sys.modules["vllm._C"] = MagicMock() sys.modules["vllm._C"] = MagicMock()
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
from vllm.utils import FlexibleArgumentParser # noqa: E402 from vllm.utils import FlexibleArgumentParser # noqa: E402
logger = logging.getLogger("mkdocs") logger = logging.getLogger("mkdocs")
@ -24,15 +25,18 @@ logger = logging.getLogger("mkdocs")
class MarkdownFormatter(HelpFormatter): class MarkdownFormatter(HelpFormatter):
"""Custom formatter that generates markdown for argument groups.""" """Custom formatter that generates markdown for argument groups."""
def __init__(self, prog): def __init__(self, prog, starting_heading_level=3):
super().__init__(prog, super().__init__(prog,
max_help_position=float('inf'), max_help_position=float('inf'),
width=float('inf')) width=float('inf'))
self._section_heading_prefix = "#" * starting_heading_level
self._argument_heading_prefix = "#" * (starting_heading_level + 1)
self._markdown_output = [] self._markdown_output = []
def start_section(self, heading): def start_section(self, heading):
if heading not in {"positional arguments", "options"}: if heading not in {"positional arguments", "options"}:
self._markdown_output.append(f"\n### {heading}\n\n") heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
self._markdown_output.append(heading_md)
def end_section(self): def end_section(self):
pass pass
@ -46,9 +50,13 @@ class MarkdownFormatter(HelpFormatter):
def add_arguments(self, actions): def add_arguments(self, actions):
for action in actions: for action in actions:
if (len(action.option_strings) == 0
or "--help" in action.option_strings):
continue
option_strings = f'`{"`, `".join(action.option_strings)}`' option_strings = f'`{"`, `".join(action.option_strings)}`'
self._markdown_output.append(f"#### {option_strings}\n\n") heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
self._markdown_output.append(heading_md)
if choices := action.choices: if choices := action.choices:
choices = f'`{"`, `".join(str(c) for c in choices)}`' choices = f'`{"`, `".join(str(c) for c in choices)}`'
@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
return cls.add_cli_args(parser, **kwargs) return cls.add_cli_args(parser, **kwargs)
def create_serve_parser() -> FlexibleArgumentParser:
    """Build the `vllm serve` argument parser with markdown help output.

    The parser's help formatter is swapped for `MarkdownFormatter` with
    `starting_heading_level=4`, and the parser is then passed through
    `make_arg_parser`.

    Returns:
        The parser returned by `make_arg_parser`.
    """

    def _markdown_formatter(prog):
        # argparse invokes the formatter factory with only `prog`, so the
        # heading level is bound here rather than at the call site.
        return MarkdownFormatter(prog, starting_heading_level=4)

    serve_parser = FlexibleArgumentParser()
    serve_parser.formatter_class = _markdown_formatter
    return make_arg_parser(serve_parser)
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
logger.info("Generating argparse documentation") logger.info("Generating argparse documentation")
logger.debug("Root directory: %s", ROOT_DIR.resolve()) logger.debug("Root directory: %s", ROOT_DIR.resolve())
@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
"engine_args": create_parser(EngineArgs), "engine_args": create_parser(EngineArgs),
"async_engine_args": create_parser(AsyncEngineArgs, "async_engine_args": create_parser(AsyncEngineArgs,
async_args_only=True), async_args_only=True),
"serve": create_serve_parser(),
} }
# Generate documentation for each parser # Generate documentation for each parser

View File

@ -17,6 +17,7 @@ cloudpickle
fastapi fastapi
msgspec msgspec
openai openai
partial-json-parser
pillow pillow
psutil psutil
pybase64 pybase64

View File

@ -67,37 +67,6 @@ class ServeSubcommand(CLISubcommand):
help="Start the vLLM OpenAI Compatible API server.", help="Start the vLLM OpenAI Compatible API server.",
description="Start the vLLM OpenAI Compatible API server.", description="Start the vLLM OpenAI Compatible API server.",
usage="vllm serve [model_tag] [options]") usage="vllm serve [model_tag] [options]")
serve_parser.add_argument("model_tag",
type=str,
nargs='?',
help="The model tag to serve "
"(optional if specified in config)")
serve_parser.add_argument(
"--headless",
action='store_true',
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
serve_parser.add_argument(
'--data-parallel-start-rank',
'-dpr',
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
serve_parser.add_argument('--api-server-count',
'-asc',
type=int,
default=1,
help='How many API server processes to run.')
serve_parser.add_argument(
"--config",
type=str,
default='',
required=False,
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
serve_parser = make_arg_parser(serve_parser) serve_parser = make_arg_parser(serve_parser)
show_filtered_argument_or_group_from_help(serve_parser, ["serve"]) show_filtered_argument_or_group_from_help(serve_parser, ["serve"])

View File

@ -236,6 +236,34 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
register all arguments instead of manually enumerating them here. This register all arguments instead of manually enumerating them here. This
avoids code duplication and keeps the argument definitions in one place. avoids code duplication and keeps the argument definitions in one place.
""" """
parser.add_argument("model_tag",
type=str,
nargs="?",
help="The model tag to serve "
"(optional if specified in config)")
parser.add_argument(
"--headless",
action="store_true",
default=False,
help="Run in headless mode. See multi-node data parallel "
"documentation for more details.")
parser.add_argument(
"--data-parallel-start-rank",
"-dpr",
type=int,
default=0,
help="Starting data parallel rank for secondary nodes. "
"Requires --headless.")
parser.add_argument("--api-server-count",
"-asc",
type=int,
default=1,
help="How many API server processes to run.")
parser.add_argument(
"--config",
help="Read CLI options from a config file. "
"Must be a YAML with the following options: "
"https://docs.vllm.ai/en/latest/configuration/serve_args.html")
parser = FrontendArgs.add_cli_args(parser) parser = FrontendArgs.add_cli_args(parser)
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)