[Refactor] [4/N] Move VLLM_SERVER_DEV endpoints into the serve directory (#30749)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2026-07-11 14:37:22 +08:00 · 2025-12-17 18:27:30 +08:00 · 2025-12-17 18:27:30 +08:00 · 9ad5b21710
commit 9ad5b21710
parent f284d7bd0c
19 changed files with 259 additions and 151 deletions
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
  --ignore=entrypoints/openai/test_audio.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
  --ignore=entrypoints/openai/test_lora_adapters.py \
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -128,7 +128,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
@@ -148,7 +148,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
@@ -162,10 +162,30 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1
+  # grade: Blocking
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s entrypoints/instrumentator # ignored by the generic entrypoints run, so it must run here
+  - pytest -v -s tool_use
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc # PYTHONPATH is needed to import custom Worker extension
+
 - label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@@ -751,17 +771,6 @@ steps:
  # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
  - pytest -s entrypoints/openai/correctness/

-- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use

 #####  models test  #####

--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -114,7 +114,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
@@ -132,7 +132,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

-- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Integration Test (API Server 1) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
@@ -144,10 +144,28 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

+- label: Entrypoints Integration Test (API Server 2)
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
+  - tests/tool_use
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s entrypoints/instrumentator # ignored by the generic entrypoints run, so it must run here
+  - PYTHONPATH=/vllm-workspace  pytest -v -s entrypoints/rpc # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
@@ -666,16 +684,6 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

-- label: OpenAI-Compatible Tool Use # 23 min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
-
 #####  models test  #####

 - label: Basic Models Tests (Initialization)
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -10,7 +10,7 @@ steps:
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling

 - label: Entrypoints Integration (LLM)
  timeout_in_minutes: 40
@@ -25,7 +25,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

-- label: Entrypoints Integration (API Server)
+- label: Entrypoints Integration (API Server 1)
  timeout_in_minutes: 130
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
@@ -34,11 +34,26 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py


+- label: Entrypoints Integration (API Server 2)
+  timeout_in_minutes: 130
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/
+  - tests/tool_use
+  - tests/entrypoints/sleep
+  - tests/entrypoints/instrumentator
+  - tests/entrypoints/rpc
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
+  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/sleep
+  - pytest -v -s tool_use
+
 - label: Entrypoints Integration (Pooling)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
--- a/.buildkite/test_areas/tool_use.yaml
+++ b/.buildkite/test_areas/tool_use.yaml
@@ -1,13 +0,0 @@
-group: Tool use
-depends_on: 
-  - image-build
-steps:
-- label: OpenAI-Compatible Tool Use
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: false
-  source_file_dependencies:
-    - vllm/
-    - tests/tool_use
-  commands:
-    - pytest -v -s tool_use
--- a/tests/entrypoints/instrumentator/__init__.py
+++ b/tests/entrypoints/instrumentator/__init__.py
--- a/tests/entrypoints/instrumentator/test_metrics.py
+++ b/tests/entrypoints/instrumentator/test_metrics.py
@@ -14,11 +14,10 @@ import requests
 from prometheus_client.parser import text_string_to_metric_families
 from transformers import AutoTokenizer

+from tests.conftest import LocalAssetServer
+from tests.utils import RemoteOpenAIServer
 from vllm import version

-from ...conftest import LocalAssetServer
-from ...utils import RemoteOpenAIServer
-
 MODELS = {
    "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",
--- a/tests/entrypoints/rpc/__init__.py
+++ b/tests/entrypoints/rpc/__init__.py
--- a/tests/entrypoints/openai/test_collective_rpc.py
+++ b/tests/entrypoints/rpc/test_collective_rpc.py
@@ -37,7 +37,7 @@ def server():
        "--max-num-seqs",
        "128",
        "--worker-extension-cls",
-        "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
+        "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
    ]
    with RemoteOpenAIServer(
        MODEL_NAME,
--- a/tests/entrypoints/sleep/__init__.py
+++ b/tests/entrypoints/sleep/__init__.py
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/sleep/test_sleep.py
@@ -4,7 +4,7 @@
 import requests
 from prometheus_client.parser import text_string_to_metric_families

-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer

 MODEL_NAME = "meta-llama/Llama-3.2-1B"

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -17,21 +17,20 @@ from argparse import Namespace
 from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any

 import model_hosting_container_standards.sagemaker as sagemaker_standards
 import pydantic
 import uvloop
-from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request
+from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response, StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.concurrency import iterate_in_threadpool
 from starlette.datastructures import URL, Headers, MutableHeaders, State
 from starlette.types import ASGIApp, Message, Receive, Scope, Send

 import vllm.envs as envs
-from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.anthropic.protocol import (
@@ -639,97 +638,6 @@ async def create_translations(
    return StreamingResponse(content=generator, media_type="text/event-stream")


-if envs.VLLM_SERVER_DEV_MODE:
-    logger.warning(
-        "SECURITY WARNING: Development endpoints are enabled! "
-        "This should NOT be used in production!"
-    )
-
-    PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
-
-    @router.get("/server_info")
-    async def show_server_info(
-        raw_request: Request,
-        config_format: Annotated[Literal["text", "json"], Query()] = "text",
-    ):
-        vllm_config: VllmConfig = raw_request.app.state.vllm_config
-        server_info = {
-            "vllm_config": str(vllm_config)
-            if config_format == "text"
-            else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
-            # fallback=str is needed to handle e.g. torch.dtype
-        }
-        return JSONResponse(content=server_info)
-
-    @router.post("/reset_prefix_cache")
-    async def reset_prefix_cache(
-        raw_request: Request,
-        reset_running_requests: bool = Query(default=False),
-        reset_external: bool = Query(default=False),
-    ):
-        """
-        Reset the local prefix cache.
-
-        Optionally, if the query parameter `reset_external=true`
-        also resets the external (connector-managed) prefix cache.
-
-        Note that we currently do not check if the prefix cache
-        is successfully reset in the API server.
-
-        Example:
-            POST /reset_prefix_cache?reset_external=true
-        """
-        logger.info("Resetting prefix cache...")
-
-        await engine_client(raw_request).reset_prefix_cache(
-            reset_running_requests, reset_external
-        )
-        return Response(status_code=200)
-
-    @router.post("/reset_mm_cache")
-    async def reset_mm_cache(raw_request: Request):
-        """
-        Reset the multi-modal cache. Note that we currently do not check if the
-        multi-modal cache is successfully reset in the API server.
-        """
-        logger.info("Resetting multi-modal cache...")
-        await engine_client(raw_request).reset_mm_cache()
-        return Response(status_code=200)
-
-    @router.post("/collective_rpc")
-    async def collective_rpc(raw_request: Request):
-        try:
-            body = await raw_request.json()
-        except json.JSONDecodeError as e:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail=f"JSON decode error: {e}",
-            ) from e
-        method = body.get("method")
-        if method is None:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail="Missing 'method' in request body",
-            )
-        # For security reason, only serialized string args/kwargs are passed.
-        # User-defined `method` is responsible for deserialization if needed.
-        args: list[str] = body.get("args", [])
-        kwargs: dict[str, str] = body.get("kwargs", {})
-        timeout: float | None = body.get("timeout")
-        results = await engine_client(raw_request).collective_rpc(
-            method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
-        )
-        if results is None:
-            return Response(status_code=200)
-        response: list[Any] = []
-        for result in results:
-            if result is None or isinstance(result, dict | list):
-                response.append(result)
-            else:
-                response.append(str(result))
-        return JSONResponse(content={"results": response})
-
-
 def load_log_config(log_config_file: str | None) -> dict | None:
    if not log_config_file:
        return None
--- a/vllm/entrypoints/serve/__init__.py
+++ b/vllm/entrypoints/serve/__init__.py
@@ -4,8 +4,19 @@

 from fastapi import FastAPI

+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+

 def register_vllm_serve_api_routers(app: FastAPI):
+    if envs.VLLM_SERVER_DEV_MODE:
+        logger.warning(
+            "SECURITY WARNING: Development endpoints are enabled! "
+            "This should NOT be used in production!"
+        )
+
    from vllm.entrypoints.serve.lora.api_router import (
        attach_router as attach_lora_router,
    )
@@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI):

    attach_sleep_router(app)

+    from vllm.entrypoints.serve.rpc.api_router import (
+        attach_router as attach_rpc_router,
+    )
+
+    attach_rpc_router(app)
+
+    from vllm.entrypoints.serve.cache.api_router import (
+        attach_router as attach_cache_router,
+    )
+
+    attach_cache_router(app)
+
    from vllm.entrypoints.serve.tokenize.api_router import (
        attach_router as attach_tokenize_router,
    )
@@ -58,3 +81,9 @@ def register_vllm_serve_api_routers(app: FastAPI):
    )

    attach_health_router(app)
+
+    from vllm.entrypoints.serve.instrumentator.server_info import (
+        attach_router as attach_server_info_router,
+    )
+
+    attach_server_info_router(app)
--- a/vllm/entrypoints/serve/cache/__init__.py
+++ b/vllm/entrypoints/serve/cache/__init__.py
--- a/vllm/entrypoints/serve/cache/api_router.py
+++ b/vllm/entrypoints/serve/cache/api_router.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    return request.app.state.engine_client
+
+
+@router.post("/reset_prefix_cache")
+async def reset_prefix_cache(
+    raw_request: Request,
+    reset_running_requests: bool = Query(default=False),
+    reset_external: bool = Query(default=False),
+):
+    """
+    Reset the local prefix cache.
+
+    Optionally, if the query parameter `reset_external=true`
+    also resets the external (connector-managed) prefix cache.
+
+    Note that we currently do not check if the prefix cache
+    is successfully reset in the API server.
+
+    Example:
+       POST /reset_prefix_cache?reset_external=true
+    """
+    logger.info("Resetting prefix cache...")
+
+    await engine_client(raw_request).reset_prefix_cache(
+        reset_running_requests, reset_external
+    )
+    return Response(status_code=200)
+
+
+@router.post("/reset_mm_cache")
+async def reset_mm_cache(raw_request: Request):
+    """
+    Reset the multi-modal cache. Note that we currently do not check if the
+    multi-modal cache is successfully reset in the API server.
+    """
+    logger.info("Resetting multi-modal cache...")
+    await engine_client(raw_request).reset_mm_cache()
+    return Response(status_code=200)
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
--- a/vllm/entrypoints/serve/instrumentator/server_info.py
+++ b/vllm/entrypoints/serve/instrumentator/server_info.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+
+from typing import Annotated, Literal
+
+import pydantic
+from fastapi import APIRouter, FastAPI, Query, Request
+from fastapi.responses import JSONResponse
+
+import vllm.envs as envs
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+router = APIRouter()
+PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
+
+
+@router.get("/server_info")
+async def show_server_info(
+    raw_request: Request,
+    config_format: Annotated[Literal["text", "json"], Query()] = "text",
+):
+    vllm_config: VllmConfig = raw_request.app.state.vllm_config
+    server_info = {
+        "vllm_config": str(vllm_config)
+        if config_format == "text"
+        else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
+        # fallback=str is needed to handle e.g. torch.dtype
+    }
+    return JSONResponse(content=server_info)
+
+
+def attach_router(app: FastAPI):
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
--- a/vllm/entrypoints/serve/rpc/__init__.py
+++ b/vllm/entrypoints/serve/rpc/__init__.py
--- a/vllm/entrypoints/serve/rpc/api_router.py
+++ b/vllm/entrypoints/serve/rpc/api_router.py
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+from http import HTTPStatus
+from typing import Any
+
+from fastapi import APIRouter, FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, Response
+
+import vllm.envs as envs
+from vllm.engine.protocol import EngineClient
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+router = APIRouter()
+
+
+def engine_client(request: Request) -> EngineClient:
+    """Return the engine client stored on the application state."""
+    return request.app.state.engine_client
+
+
+@router.post("/collective_rpc")
+async def collective_rpc(raw_request: Request):
+    """Forward a JSON-described call to the engine's `collective_rpc`.
+
+    The body must be a JSON object with a required `method` and optional
+    `args`, `kwargs` and `timeout`. Malformed JSON, a non-object body or a
+    missing `method` is rejected with 400 Bad Request.
+    """
+    try:
+        body = await raw_request.json()
+    except json.JSONDecodeError as e:
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail=f"JSON decode error: {e}",
+        ) from e
+    # Valid JSON can still be a list/string/number, which has no `.get`;
+    # reject it with 400 instead of failing with an AttributeError (500).
+    if not isinstance(body, dict):
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail="Request body must be a JSON object",
+        )
+    method = body.get("method")
+    if method is None:
+        raise HTTPException(
+            status_code=HTTPStatus.BAD_REQUEST.value,
+            detail="Missing 'method' in request body",
+        )
+    # For security reason, only serialized string args/kwargs are passed.
+    # User-defined `method` is responsible for deserialization if needed.
+    args: list[str] = body.get("args", [])
+    kwargs: dict[str, str] = body.get("kwargs", {})
+    timeout: float | None = body.get("timeout")
+    results = await engine_client(raw_request).collective_rpc(
+        method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
+    )
+    if results is None:
+        return Response(status_code=200)
+    # Pass None, dict and list results through unchanged; stringify
+    # everything else before building the JSON response.
+    response: list[Any] = []
+    for result in results:
+        if result is None or isinstance(result, dict | list):
+            response.append(result)
+        else:
+            response.append(str(result))
+    return JSONResponse(content={"results": response})
+
+
+def attach_router(app: FastAPI):
+    """Include the RPC router only when VLLM_SERVER_DEV_MODE is set."""
+    if not envs.VLLM_SERVER_DEV_MODE:
+        return
+    app.include_router(router)
--- a/vllm/entrypoints/serve/sleep/api_router.py
+++ b/vllm/entrypoints/serve/sleep/api_router.py
@@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request):
 def attach_router(app: FastAPI):
    if not envs.VLLM_SERVER_DEV_MODE:
        return
-    logger.warning(
-        "SECURITY WARNING: Development endpoints are enabled! "
-        "This should NOT be used in production!"
-    )

    app.include_router(router)