Add full serve CLI reference back to docs (#20978)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2026-06-01 01:37:04 +08:00 · 2025-07-15 18:42:30 +01:00 · 2025-07-15 18:42:30 +01:00 · b637e9dcb8
commit b637e9dcb8
parent 1e36c8687e
6 changed files with 58 additions and 35 deletions
--- a/docs/cli/README.md
+++ b/docs/cli/README.md
@ -1,3 +1,7 @@
 ---
 toc_depth: 4
 ---
 # vLLM CLI Guide
 The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
@ -42,6 +46,10 @@ Start the vLLM OpenAI Compatible API server.
    vllm serve --help=page
    ```
 ### Options
 --8<-- "docs/argparse/serve.md"
 ## chat
 Generate chat completions via the running API server.
--- a/docs/configuration/serve_args.md
+++ b/docs/configuration/serve_args.md
@ -5,7 +5,7 @@ The `vllm serve` command is used to launch the OpenAI-compatible server.
 ## CLI Arguments
 The `vllm serve` command is used to launch the OpenAI-compatible server.
-To see the available CLI arguments, run `vllm serve --help`!
+To see the available options, take a look at the [CLI Reference](../cli/README.md#options)!
 ## Configuration file
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@ -16,6 +16,7 @@ sys.modules["blake3"] = MagicMock()
 sys.modules["vllm._C"] = MagicMock()
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs  # noqa: E402
 from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
 from vllm.utils import FlexibleArgumentParser  # noqa: E402
 logger = logging.getLogger("mkdocs")
@ -24,15 +25,18 @@ logger = logging.getLogger("mkdocs")
 class MarkdownFormatter(HelpFormatter):
    """Custom formatter that generates markdown for argument groups."""
-    def __init__(self, prog):
+    def __init__(self, prog, starting_heading_level=3):
        super().__init__(prog,
                         max_help_position=float('inf'),
                         width=float('inf'))
        self._section_heading_prefix = "#" * starting_heading_level
        self._argument_heading_prefix = "#" * (starting_heading_level + 1)
        self._markdown_output = []
    def start_section(self, heading):
        if heading not in {"positional arguments", "options"}:
-            self._markdown_output.append(f"\n### {heading}\n\n")
+            heading_md = f"\n{self._section_heading_prefix} {heading}\n\n"
            self._markdown_output.append(heading_md)
    def end_section(self):
        pass
@ -46,9 +50,13 @@ class MarkdownFormatter(HelpFormatter):
    def add_arguments(self, actions):
        for action in actions:
            if (len(action.option_strings) == 0
                    or "--help" in action.option_strings):
                continue
            option_strings = f'`{"`, `".join(action.option_strings)}`'
-            self._markdown_output.append(f"#### {option_strings}\n\n")
+            heading_md = f"{self._argument_heading_prefix} {option_strings}\n\n"
            self._markdown_output.append(heading_md)
            if choices := action.choices:
                choices = f'`{"`, `".join(str(c) for c in choices)}`'
@ -81,6 +89,14 @@ def create_parser(cls, **kwargs) -> FlexibleArgumentParser:
        return cls.add_cli_args(parser, **kwargs)
 def create_serve_parser() -> FlexibleArgumentParser:
    """Create a parser for the serve command with markdown formatting.

    Builds a fresh ``FlexibleArgumentParser``, swaps in a formatter factory
    that produces ``MarkdownFormatter`` instances, and then hands the parser
    to ``make_arg_parser`` so it registers the serve arguments.

    Returns:
        The parser returned by ``make_arg_parser``.
    """
    parser = FlexibleArgumentParser()
    # argparse invokes ``formatter_class`` with only ``prog``, so a lambda is
    # used to pass the extra ``starting_heading_level`` argument. Level 4 is
    # one deeper than the formatter's default of 3 -- presumably so the
    # generated headings nest under the docs section that includes this
    # output (NOTE(review): confirm against the including page).
    parser.formatter_class = lambda prog: MarkdownFormatter(
        prog, starting_heading_level=4)
    return make_arg_parser(parser)
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    logger.info("Generating argparse documentation")
    logger.debug("Root directory: %s", ROOT_DIR.resolve())
@ -95,6 +111,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
        "engine_args": create_parser(EngineArgs),
        "async_engine_args": create_parser(AsyncEngineArgs,
                                           async_args_only=True),
        "serve": create_serve_parser(),
    }
    # Generate documentation for each parser
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@ -17,6 +17,7 @@ cloudpickle
 fastapi
 msgspec
 openai
 partial-json-parser
 pillow
 psutil
 pybase64
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@ -67,37 +67,6 @@ class ServeSubcommand(CLISubcommand):
            help="Start the vLLM OpenAI Compatible API server.",
            description="Start the vLLM OpenAI Compatible API server.",
            usage="vllm serve [model_tag] [options]")
        serve_parser.add_argument("model_tag",
                                  type=str,
                                  nargs='?',
                                  help="The model tag to serve "
                                  "(optional if specified in config)")
        serve_parser.add_argument(
            "--headless",
            action='store_true',
            default=False,
            help="Run in headless mode. See multi-node data parallel "
            "documentation for more details.")
        serve_parser.add_argument(
            '--data-parallel-start-rank',
            '-dpr',
            type=int,
            default=0,
            help="Starting data parallel rank for secondary nodes. "
            "Requires --headless.")
        serve_parser.add_argument('--api-server-count',
                                  '-asc',
                                  type=int,
                                  default=1,
                                  help='How many API server processes to run.')
        serve_parser.add_argument(
            "--config",
            type=str,
            default='',
            required=False,
            help="Read CLI options from a config file. "
            "Must be a YAML with the following options: "
            "https://docs.vllm.ai/en/latest/configuration/serve_args.html")
        serve_parser = make_arg_parser(serve_parser)
        show_filtered_argument_or_group_from_help(serve_parser, ["serve"])
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@ -236,6 +236,34 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    register all arguments instead of manually enumerating them here. This
    avoids code duplication and keeps the argument definitions in one place.
    """
    parser.add_argument("model_tag",
                        type=str,
                        nargs="?",
                        help="The model tag to serve "
                        "(optional if specified in config)")
    parser.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help="Run in headless mode. See multi-node data parallel "
        "documentation for more details.")
    parser.add_argument(
        "--data-parallel-start-rank",
        "-dpr",
        type=int,
        default=0,
        help="Starting data parallel rank for secondary nodes. "
        "Requires --headless.")
    parser.add_argument("--api-server-count",
                        "-asc",
                        type=int,
                        default=1,
                        help="How many API server processes to run.")
    parser.add_argument(
        "--config",
        help="Read CLI options from a config file. "
        "Must be a YAML with the following options: "
        "https://docs.vllm.ai/en/latest/configuration/serve_args.html")
    parser = FrontendArgs.add_cli_args(parser)
    parser = AsyncEngineArgs.add_cli_args(parser)