diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 74235db16a15d..3a93497fab137 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -126,6 +126,7 @@ curl http://localhost:8000/v1/models
 ```
 
 You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header.
+You can pass multiple keys after `--api-key`, and the server will accept any of the keys passed, which can be useful for key rotation.
 
 ### OpenAI Completions API with vLLM
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index c375c8755108c..05d9a69a65f83 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1239,9 +1239,9 @@ class AuthenticationMiddleware:
     2. The request path doesn't start with /v1 (e.g. /health).
     """
 
-    def __init__(self, app: ASGIApp, api_token: str) -> None:
+    def __init__(self, app: ASGIApp, tokens: list[str]) -> None:
         self.app = app
-        self.api_token = api_token
+        self.api_tokens = {f"Bearer {token}" for token in tokens}
 
     def __call__(self, scope: Scope, receive: Receive,
                  send: Send) -> Awaitable[None]:
@@ -1255,7 +1255,7 @@ class AuthenticationMiddleware:
         headers = Headers(scope=scope)
         # Type narrow to satisfy mypy.
         if url_path.startswith("/v1") and headers.get(
-                "Authorization") != f"Bearer {self.api_token}":
+                "Authorization") not in self.api_tokens:
             response = JSONResponse(content={"error": "Unauthorized"},
                                     status_code=401)
             return response(scope, receive, send)
@@ -1303,7 +1303,7 @@ class ScalingMiddleware:
     """
     Middleware that checks if the model is currently scaling and
     returns a 503 Service Unavailable response if it is.
-    
+
    This middleware applies to all HTTP requests and prevents
    processing when the model is in a scaling state.
    """
@@ -1512,8 +1512,8 @@ def build_app(args: Namespace) -> FastAPI:
                                 status_code=HTTPStatus.BAD_REQUEST)
 
     # Ensure --api-key option from CLI takes precedence over VLLM_API_KEY
-    if token := args.api_key or envs.VLLM_API_KEY:
-        app.add_middleware(AuthenticationMiddleware, api_token=token)
+    if tokens := [key for key in (args.api_key or [envs.VLLM_API_KEY]) if key]:
+        app.add_middleware(AuthenticationMiddleware, tokens=tokens)
 
     if args.enable_request_id_headers:
         app.add_middleware(XRequestIdMiddleware)
 
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 282493e543552..dfbc9cde3d5b1 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -85,22 +85,22 @@ class FrontendArgs:
     """Allowed methods."""
     allowed_headers: list[str] = field(default_factory=lambda: ["*"])
     """Allowed headers."""
-    api_key: Optional[str] = None
-    """If provided, the server will require this key to be presented in the
-    header."""
+    api_key: Optional[list[str]] = None
+    """If provided, the server will require one of these keys to be presented in
+    the header."""
     lora_modules: Optional[list[LoRAModulePath]] = None
     """LoRA modules configurations in either 'name=path' format or JSON format
-    or JSON list format. Example (old format): `'name=path'` Example (new 
-    format): `{\"name\": \"name\", \"path\": \"lora_path\", 
+    or JSON list format. Example (old format): `'name=path'` Example (new
+    format): `{\"name\": \"name\", \"path\": \"lora_path\",
     \"base_model_name\": \"id\"}`"""
     chat_template: Optional[str] = None
-    """The file path to the chat template, or the template in single-line form 
+    """The file path to the chat template, or the template in single-line form
     for the specified model."""
     chat_template_content_format: ChatTemplateContentFormatOption = "auto"
     """The format to render message content within a chat template.
 
 * "string" will render the content as a string. Example: `"Hello World"`
-* "openai" will render the content as a list of dictionaries, similar to OpenAI 
+* "openai" will render the content as a list of dictionaries, similar to OpenAI
 schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
     response_role: str = "assistant"
     """The role name to return if `request.add_generation_prompt=true`."""
@@ -117,40 +117,40 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
     root_path: Optional[str] = None
     """FastAPI root_path when app is behind a path based routing proxy."""
     middleware: list[str] = field(default_factory=lambda: [])
-    """Additional ASGI middleware to apply to the app. We accept multiple 
-    --middleware arguments. The value should be an import path. If a function 
-    is provided, vLLM will add it to the server using 
-    `@app.middleware('http')`. If a class is provided, vLLM will 
+    """Additional ASGI middleware to apply to the app. We accept multiple
+    --middleware arguments. The value should be an import path. If a function
+    is provided, vLLM will add it to the server using
+    `@app.middleware('http')`. If a class is provided, vLLM will
     add it to the server using `app.add_middleware()`."""
     return_tokens_as_token_ids: bool = False
-    """When `--max-logprobs` is specified, represents single tokens as 
-    strings of the form 'token_id:{token_id}' so that tokens that are not 
+    """When `--max-logprobs` is specified, represents single tokens as
+    strings of the form 'token_id:{token_id}' so that tokens that are not
     JSON-encodable can be identified."""
     disable_frontend_multiprocessing: bool = False
-    """If specified, will run the OpenAI frontend server in the same process as 
+    """If specified, will run the OpenAI frontend server in the same process as
     the model serving engine."""
     enable_request_id_headers: bool = False
-    """If specified, API server will add X-Request-Id header to responses. 
+    """If specified, API server will add X-Request-Id header to responses.
     Caution: this hurts performance at high QPS."""
     enable_auto_tool_choice: bool = False
-    """If specified, exclude tool definitions in prompts when 
+    """If specified, exclude tool definitions in prompts when
     tool_choice='none'."""
     exclude_tools_when_tool_choice_none: bool = False
-    """Enable auto tool choice for supported models. Use `--tool-call-parser` 
+    """Enable auto tool choice for supported models. Use `--tool-call-parser`
     to specify which parser to use."""
     tool_call_parser: Optional[str] = None
-    """Select the tool call parser depending on the model that you're using. 
-    This is used to parse the model-generated tool call into OpenAI API format. 
-    Required for `--enable-auto-tool-choice`. You can choose any option from 
+    """Select the tool call parser depending on the model that you're using.
+    This is used to parse the model-generated tool call into OpenAI API format.
+    Required for `--enable-auto-tool-choice`. You can choose any option from
     the built-in parsers or register a plugin via `--tool-parser-plugin`."""
     tool_parser_plugin: str = ""
-    """Special the tool parser plugin write to parse the model-generated tool 
-    into OpenAI API format, the name register in this plugin can be used in 
+    """Special the tool parser plugin write to parse the model-generated tool
+    into OpenAI API format, the name register in this plugin can be used in
     `--tool-call-parser`."""
     log_config_file: Optional[str] = envs.VLLM_LOGGING_CONFIG_PATH
     """Path to logging config JSON file for both vllm and uvicorn"""
     max_log_len: Optional[int] = None
-    """Max number of prompt characters or prompt ID numbers being printed in 
+    """Max number of prompt characters or prompt ID numbers being printed in
     log. The default of None means unlimited."""
     disable_fastapi_docs: bool = False
     """Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""