From b08025a83bf416d97d0547ac52c3909356e118c4 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 2 Dec 2025 23:57:28 -0500 Subject: [PATCH 1/2] [Docs] Discuss api key limitations in security guide (#29922) Signed-off-by: Russell Bryant --- docs/usage/security.md | 110 +++++++++++++++++++++++++++++++++ vllm/entrypoints/cli/openai.py | 4 ++ 2 files changed, 114 insertions(+) diff --git a/docs/usage/security.md b/docs/usage/security.md index 9d10b66a5a97f..74060d86f6854 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -108,6 +108,116 @@ networks. Consult your operating system or application platform documentation for specific firewall configuration instructions. +## API Key Authentication Limitations + +### Overview + +The `--api-key` flag (or `VLLM_API_KEY` environment variable) provides authentication for vLLM's HTTP server, but **only for OpenAI-compatible API endpoints under the `/v1` path prefix**. Many other sensitive endpoints are exposed on the same HTTP server without any authentication enforcement. + +**Important:** Do not rely exclusively on `--api-key` for securing access to vLLM. Additional security measures are required for production deployments. 
+ +### Protected Endpoints (Require API Key) + +When `--api-key` is configured, the following `/v1` endpoints require Bearer token authentication: + +- `/v1/models` - List available models +- `/v1/chat/completions` - Chat completions +- `/v1/completions` - Text completions +- `/v1/embeddings` - Generate embeddings +- `/v1/audio/transcriptions` - Audio transcription +- `/v1/audio/translations` - Audio translation +- `/v1/messages` - Anthropic-compatible messages API +- `/v1/responses` - Response management +- `/v1/score` - Scoring API +- `/v1/rerank` - Reranking API + +### Unprotected Endpoints (No API Key Required) + +The following endpoints **do not require authentication** even when `--api-key` is configured: + +**Inference endpoints:** + +- `/invocations` - SageMaker-compatible endpoint (routes to the same inference functions as `/v1` endpoints) +- `/inference/v1/generate` - Generate completions +- `/pooling` - Pooling API +- `/classify` - Classification API +- `/score` - Scoring API (non-`/v1` variant) +- `/rerank` - Reranking API (non-`/v1` variant) + +**Operational control endpoints (always enabled):** + +- `/pause` - Pause generation (causes denial of service) +- `/resume` - Resume generation +- `/scale_elastic_ep` - Trigger scaling operations + +**Utility endpoints:** + +- `/tokenize` - Tokenize text +- `/detokenize` - Detokenize tokens +- `/health` - Health check +- `/ping` - SageMaker health check +- `/version` - Version information +- `/load` - Server load metrics + +**Tokenizer information endpoint (only when `--enable-tokenizer-info-endpoint` is set):** + +This endpoint is **only available when the `--enable-tokenizer-info-endpoint` flag is set**. 
It may expose sensitive information such as chat templates and tokenizer configuration: + +- `/tokenizer_info` - Get comprehensive tokenizer information including chat templates and configuration + +**Development endpoints (only when `VLLM_SERVER_DEV_MODE=1`):** + +These endpoints are **only available when the environment variable `VLLM_SERVER_DEV_MODE` is set to `1`**. They are intended for development and debugging purposes and should never be enabled in production: + +- `/server_info` - Get detailed server configuration +- `/reset_prefix_cache` - Reset prefix cache (can disrupt service) +- `/reset_mm_cache` - Reset multimodal cache (can disrupt service) +- `/sleep` - Put engine to sleep (causes denial of service) +- `/wake_up` - Wake engine from sleep +- `/is_sleeping` - Check if engine is sleeping +- `/collective_rpc` - Execute arbitrary RPC methods on the engine (extremely dangerous) + +**Profiler endpoints (only when `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE` is set):** + +These endpoints are only available when profiling is enabled and should only be used for local development: + +- `/start_profile` - Start PyTorch profiler +- `/stop_profile` - Stop PyTorch profiler + +**Note:** The `/invocations` endpoint is particularly concerning as it provides unauthenticated access to the same inference capabilities as the protected `/v1` endpoints. + +### Security Implications + +An attacker who can reach the vLLM HTTP server can: + +1. **Bypass authentication** by using non-`/v1` endpoints like `/invocations`, `/inference/v1/generate`, `/pooling`, `/classify`, `/score`, or `/rerank` to run arbitrary inference without credentials +2. **Cause denial of service** by calling `/pause` or `/scale_elastic_ep` without a token +3. **Access operational controls** to manipulate server state (e.g., pausing generation) +4. 
**If `--enable-tokenizer-info-endpoint` is set:** Access sensitive tokenizer configuration including chat templates, which may reveal prompt engineering strategies or other implementation details +5. **If `VLLM_SERVER_DEV_MODE=1` is set:** Execute arbitrary RPC commands via `/collective_rpc`, reset caches, put the engine to sleep, and access detailed server configuration + +### Recommended Security Practices + +#### 1. Minimize Exposed Endpoints + +**CRITICAL:** Never set `VLLM_SERVER_DEV_MODE=1` in production environments. Development endpoints expose extremely dangerous functionality, including: + +- Arbitrary RPC execution via `/collective_rpc` +- Cache manipulation that can disrupt service +- Detailed server configuration disclosure + +Similarly, never enable the profiler endpoints (by setting `VLLM_TORCH_PROFILER_DIR` or `VLLM_TORCH_CUDA_PROFILE`) in production. + +**Be cautious with `--enable-tokenizer-info-endpoint`:** Only enable the `/tokenizer_info` endpoint if you need to expose tokenizer configuration information. This endpoint reveals chat templates and tokenizer settings that may contain sensitive implementation details or prompt engineering strategies. + +#### 2. Deploy Behind a Reverse Proxy + +The most effective approach is to deploy vLLM behind a reverse proxy (such as nginx, Envoy, or a Kubernetes Gateway) that: + +- Explicitly allowlists only the endpoints you want to expose to end users +- Blocks all other endpoints, including the unauthenticated inference and operational control endpoints +- Implements additional authentication, rate limiting, and logging at the proxy layer + ## Reporting Security Vulnerabilities If you believe you have found a security vulnerability in vLLM, please report it following the project's security policy. For more information on how to report security issues and the project's security policy, please see the [vLLM Security Policy](https://github.com/vllm-project/vllm/blob/main/SECURITY.md). 
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index fb49be370203e..1c18b193d1cdc 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -109,6 +109,10 @@ def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser help=( "API key for OpenAI services. If provided, this api key " "will overwrite the api key obtained through environment variables." + " It is important to note that this option only applies to the " + "OpenAI-compatible API endpoints and NOT other endpoints that may " + "be present in the server. See the security guide in the vLLM docs " + "for more details." ), ) return parser From c719c40540a85c1e6aeee9af20f29db581da27f0 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Wed, 3 Dec 2025 13:15:50 +0800 Subject: [PATCH 2/2] [Bugfix] Defunctionalize TRTLLM AR+Norm op for avoiding extra clone kernel before it (#29631) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Signed-off-by: Luka Govedič Co-authored-by: Luka Govedič --- vllm/compilation/fix_functionalization.py | 12 ++++++++++++ vllm/compilation/fx_utils.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 126ad35e527ae..76068f86ebfb3 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -103,6 +103,18 @@ class FixFunctionalizationPass(VllmInductorPass): ]: mutated_args = {1: "result"} self.defunctionalize(graph, node, mutated_args) + elif ( + at_target + == torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default + ): + mutated_args = { + 1: "allreduce_in", + 2: "residual", + 3: "norm_out", + 4: "quant_out", + 5: "scale_out", + } + self.defunctionalize(graph, node, mutated_args) # For some 
reason we need to specify the args for both # silu_and_mul and silu_and_mul_quant. The kwargs # pathway gets the wrong answer. diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index f2497950fc22f..3650ee6b41745 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -75,8 +75,8 @@ def find_op_nodes( return assert isinstance(op, OpOverload) - if not op._schema.is_mutable: - yield from graph.find_nodes(op="call_function", target=op) + + yield from graph.find_nodes(op="call_function", target=op) for n in graph.find_nodes(op="call_function", target=auto_functionalized): if n.args[0] == op: