From 9ad5b2171002522772de0a0cc71b747068ec8862 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 17 Dec 2025 18:27:30 +0800 Subject: [PATCH] [Refactor] [4/N] Move VLLM_SERVER_DEV endpoints into the serve directory (#30749) Signed-off-by: chaunceyjiang --- .../scripts/hardware_ci/run-amd-test.sh | 1 - .buildkite/test-amd.yaml | 37 ++++--- .buildkite/test-pipeline.yaml | 34 ++++--- .buildkite/test_areas/entrypoints.yaml | 23 ++++- .buildkite/test_areas/tool_use.yaml | 13 --- tests/entrypoints/instrumentator/__init__.py | 0 .../test_metrics.py | 5 +- tests/entrypoints/rpc/__init__.py | 0 .../{openai => rpc}/test_collective_rpc.py | 2 +- tests/entrypoints/sleep/__init__.py | 0 .../{openai => sleep}/test_sleep.py | 2 +- vllm/entrypoints/openai/api_server.py | 98 +------------------ vllm/entrypoints/serve/__init__.py | 29 ++++++ vllm/entrypoints/serve/cache/__init__.py | 0 vllm/entrypoints/serve/cache/api_router.py | 61 ++++++++++++ .../serve/instrumentator/server_info.py | 40 ++++++++ vllm/entrypoints/serve/rpc/__init__.py | 0 vllm/entrypoints/serve/rpc/api_router.py | 61 ++++++++++++ vllm/entrypoints/serve/sleep/api_router.py | 4 - 19 files changed, 259 insertions(+), 151 deletions(-) delete mode 100644 .buildkite/test_areas/tool_use.yaml create mode 100644 tests/entrypoints/instrumentator/__init__.py rename tests/entrypoints/{openai => instrumentator}/test_metrics.py (99%) create mode 100644 tests/entrypoints/rpc/__init__.py rename tests/entrypoints/{openai => rpc}/test_collective_rpc.py (96%) create mode 100644 tests/entrypoints/sleep/__init__.py rename tests/entrypoints/{openai => sleep}/test_sleep.py (98%) create mode 100644 vllm/entrypoints/serve/cache/__init__.py create mode 100644 vllm/entrypoints/serve/cache/api_router.py create mode 100644 vllm/entrypoints/serve/instrumentator/server_info.py create mode 100644 vllm/entrypoints/serve/rpc/__init__.py create mode 100644 vllm/entrypoints/serve/rpc/api_router.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 864eb470bb0a7..08da34d81d117 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then --ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_completion.py \ - --ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_lora_adapters.py \ --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 3c9b8cbedcf06..e8f99100a8de0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -128,7 +128,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -148,7 +148,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 @@ -162,10 +162,28 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -751,17 +769,6 @@ steps: # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use ##### models test ##### diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8e6d32f71f220..b4de630b09417 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -114,7 +114,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -132,7 +132,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -144,10 +144,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -666,16 +682,6 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use - ##### models test ##### - label: Basic Models Tests (Initialization) diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 0a789be943f37..5b16ea9c1ad07 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -10,7 +10,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration (LLM) timeout_in_minutes: 40 @@ -25,7 +25,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration (API Server) +- label: Entrypoints Integration (API Server 1) timeout_in_minutes: 130 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -34,11 +34,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration (API Server 2) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/tool_use + - tests/entrypoints/sleep + - tests/entrypoints/instrumentator + - tests/entrypoints/rpc + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - label: Entrypoints Integration (Pooling) timeout_in_minutes: 50 working_dir: "/vllm-workspace/tests" diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml deleted file mode 100644 index 69527a1214229..0000000000000 --- a/.buildkite/test_areas/tool_use.yaml +++ /dev/null @@ -1,13 +0,0 @@ -group: Tool use -depends_on: - - image-build -steps: -- label: OpenAI-Compatible Tool Use - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use diff --git a/tests/entrypoints/instrumentator/__init__.py b/tests/entrypoints/instrumentator/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/instrumentator/test_metrics.py similarity index 99% rename from tests/entrypoints/openai/test_metrics.py rename to tests/entrypoints/instrumentator/test_metrics.py index 65a6fd20bd0d1..9f2ad105a380b 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/instrumentator/test_metrics.py @@ -14,11 +14,10 @@ import requests from prometheus_client.parser import text_string_to_metric_families from transformers import AutoTokenizer +from tests.conftest import LocalAssetServer +from tests.utils import RemoteOpenAIServer from vllm import version -from ...conftest import LocalAssetServer -from ...utils import RemoteOpenAIServer - MODELS = { "text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct", diff --git a/tests/entrypoints/rpc/__init__.py b/tests/entrypoints/rpc/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_collective_rpc.py b/tests/entrypoints/rpc/test_collective_rpc.py similarity index 96% rename from tests/entrypoints/openai/test_collective_rpc.py rename to tests/entrypoints/rpc/test_collective_rpc.py index cbd6b02f05dce..56d93a427315f 100644 --- a/tests/entrypoints/openai/test_collective_rpc.py +++ b/tests/entrypoints/rpc/test_collective_rpc.py @@ -37,7 +37,7 @@ def server(): "--max-num-seqs", "128", "--worker-extension-cls", - "tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension", + "tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension", ] with RemoteOpenAIServer( MODEL_NAME, diff --git a/tests/entrypoints/sleep/__init__.py b/tests/entrypoints/sleep/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/sleep/test_sleep.py similarity index 98% rename from tests/entrypoints/openai/test_sleep.py rename to tests/entrypoints/sleep/test_sleep.py index 5f94ac6da2c25..260dcd00bae91 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/sleep/test_sleep.py @@ -4,7 +4,7 @@ import requests from prometheus_client.parser import text_string_to_metric_families -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5d0eacae34dd7..bca9571e39344 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,21 +17,20 @@ from argparse import Namespace from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Annotated, Any, Literal +from typing import Annotated, Any import model_hosting_container_standards.sagemaker as sagemaker_standards import pydantic import uvloop -from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request +from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, Response, StreamingResponse +from fastapi.responses import JSONResponse, StreamingResponse from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import URL, Headers, MutableHeaders, State from starlette.types import ASGIApp, Message, Receive, Scope, Send import vllm.envs as envs -from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.anthropic.protocol import ( @@ -639,97 +638,6 @@ async def create_translations( return StreamingResponse(content=generator, media_type="text/event-stream") -if envs.VLLM_SERVER_DEV_MODE: - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" - ) - - PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) - - @router.get("/server_info") - async def show_server_info( - raw_request: Request, - config_format: Annotated[Literal["text", "json"], Query()] = "text", - ): - vllm_config: VllmConfig = raw_request.app.state.vllm_config - server_info = { - "vllm_config": str(vllm_config) - if config_format == "text" - else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str) - # fallback=str is needed to handle e.g. torch.dtype - } - return JSONResponse(content=server_info) - - @router.post("/reset_prefix_cache") - async def reset_prefix_cache( - raw_request: Request, - reset_running_requests: bool = Query(default=False), - reset_external: bool = Query(default=False), - ): - """ - Reset the local prefix cache. - - Optionally, if the query parameter `reset_external=true` - also resets the external (connector-managed) prefix cache. - - Note that we currently do not check if the prefix cache - is successfully reset in the API server. - - Example: - POST /reset_prefix_cache?reset_external=true - """ - logger.info("Resetting prefix cache...") - - await engine_client(raw_request).reset_prefix_cache( - reset_running_requests, reset_external - ) - return Response(status_code=200) - - @router.post("/reset_mm_cache") - async def reset_mm_cache(raw_request: Request): - """ - Reset the multi-modal cache. Note that we currently do not check if the - multi-modal cache is successfully reset in the API server. - """ - logger.info("Resetting multi-modal cache...") - await engine_client(raw_request).reset_mm_cache() - return Response(status_code=200) - - @router.post("/collective_rpc") - async def collective_rpc(raw_request: Request): - try: - body = await raw_request.json() - except json.JSONDecodeError as e: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail=f"JSON decode error: {e}", - ) from e - method = body.get("method") - if method is None: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail="Missing 'method' in request body", - ) - # For security reason, only serialized string args/kwargs are passed. - # User-defined `method` is responsible for deserialization if needed. - args: list[str] = body.get("args", []) - kwargs: dict[str, str] = body.get("kwargs", {}) - timeout: float | None = body.get("timeout") - results = await engine_client(raw_request).collective_rpc( - method=method, timeout=timeout, args=tuple(args), kwargs=kwargs - ) - if results is None: - return Response(status_code=200) - response: list[Any] = [] - for result in results: - if result is None or isinstance(result, dict | list): - response.append(result) - else: - response.append(str(result)) - return JSONResponse(content={"results": response}) - - def load_log_config(log_config_file: str | None) -> dict | None: if not log_config_file: return None diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py index c4fcc92db931f..260fd44a02ccb 100644 --- a/vllm/entrypoints/serve/__init__.py +++ b/vllm/entrypoints/serve/__init__.py @@ -4,8 +4,19 @@ from fastapi import FastAPI +import vllm.envs as envs +from vllm.logger import init_logger + +logger = init_logger(__name__) + def register_vllm_serve_api_routers(app: FastAPI): + if envs.VLLM_SERVER_DEV_MODE: + logger.warning( + "SECURITY WARNING: Development endpoints are enabled! " + "This should NOT be used in production!" + ) + from vllm.entrypoints.serve.lora.api_router import ( attach_router as attach_lora_router, ) @@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI): attach_sleep_router(app) + from vllm.entrypoints.serve.rpc.api_router import ( + attach_router as attach_rpc_router, + ) + + attach_rpc_router(app) + + from vllm.entrypoints.serve.cache.api_router import ( + attach_router as attach_cache_router, + ) + + attach_cache_router(app) + from vllm.entrypoints.serve.tokenize.api_router import ( attach_router as attach_tokenize_router, ) @@ -58,3 +81,9 @@ def register_vllm_serve_api_routers(app: FastAPI): ) attach_health_router(app) + + from vllm.entrypoints.serve.instrumentator.server_info import ( + attach_router as attach_server_info_router, + ) + + attach_server_info_router(app) diff --git a/vllm/entrypoints/serve/cache/__init__.py b/vllm/entrypoints/serve/cache/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/cache/api_router.py b/vllm/entrypoints/serve/cache/api_router.py new file mode 100644 index 0000000000000..d659895463273 --- /dev/null +++ b/vllm/entrypoints/serve/cache/api_router.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/reset_prefix_cache") +async def reset_prefix_cache( + raw_request: Request, + reset_running_requests: bool = Query(default=False), + reset_external: bool = Query(default=False), +): + """ + Reset the local prefix cache. + + Optionally, if the query parameter `reset_external=true` + also resets the external (connector-managed) prefix cache. + + Note that we currently do not check if the prefix cache + is successfully reset in the API server. + + Example: + POST /reset_prefix_cache?reset_external=true + """ + logger.info("Resetting prefix cache...") + + await engine_client(raw_request).reset_prefix_cache( + reset_running_requests, reset_external + ) + return Response(status_code=200) + + +@router.post("/reset_mm_cache") +async def reset_mm_cache(raw_request: Request): + """ + Reset the multi-modal cache. Note that we currently do not check if the + multi-modal cache is successfully reset in the API server. + """ + logger.info("Resetting multi-modal cache...") + await engine_client(raw_request).reset_mm_cache() + return Response(status_code=200) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/instrumentator/server_info.py new file mode 100644 index 0000000000000..1a69dfacae1c2 --- /dev/null +++ b/vllm/entrypoints/serve/instrumentator/server_info.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from typing import Annotated, Literal + +import pydantic +from fastapi import APIRouter, FastAPI, Query, Request +from fastapi.responses import JSONResponse + +import vllm.envs as envs +from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +router = APIRouter() +PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig) + + +@router.get("/server_info") +async def show_server_info( + raw_request: Request, + config_format: Annotated[Literal["text", "json"], Query()] = "text", +): + vllm_config: VllmConfig = raw_request.app.state.vllm_config + server_info = { + "vllm_config": str(vllm_config) + if config_format == "text" + else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str) + # fallback=str is needed to handle e.g. torch.dtype + } + return JSONResponse(content=server_info) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/rpc/__init__.py b/vllm/entrypoints/serve/rpc/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/serve/rpc/api_router.py b/vllm/entrypoints/serve/rpc/api_router.py new file mode 100644 index 0000000000000..54f582c408d54 --- /dev/null +++ b/vllm/entrypoints/serve/rpc/api_router.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from http import HTTPStatus +from typing import Any + +from fastapi import APIRouter, FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse, Response + +import vllm.envs as envs +from vllm.engine.protocol import EngineClient +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def engine_client(request: Request) -> EngineClient: + return request.app.state.engine_client + + +@router.post("/collective_rpc") +async def collective_rpc(raw_request: Request): + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}", + ) from e + method = body.get("method") + if method is None: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail="Missing 'method' in request body", + ) + # For security reason, only serialized string args/kwargs are passed. + # User-defined `method` is responsible for deserialization if needed. + args: list[str] = body.get("args", []) + kwargs: dict[str, str] = body.get("kwargs", {}) + timeout: float | None = body.get("timeout") + results = await engine_client(raw_request).collective_rpc( + method=method, timeout=timeout, args=tuple(args), kwargs=kwargs + ) + if results is None: + return Response(status_code=200) + response: list[Any] = [] + for result in results: + if result is None or isinstance(result, dict | list): + response.append(result) + else: + response.append(str(result)) + return JSONResponse(content={"results": response}) + + +def attach_router(app: FastAPI): + if not envs.VLLM_SERVER_DEV_MODE: + return + app.include_router(router) diff --git a/vllm/entrypoints/serve/sleep/api_router.py b/vllm/entrypoints/serve/sleep/api_router.py index bc01e185315c8..c0e4c3028b2ea 100644 --- a/vllm/entrypoints/serve/sleep/api_router.py +++ b/vllm/entrypoints/serve/sleep/api_router.py @@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request): def attach_router(app: FastAPI): if not envs.VLLM_SERVER_DEV_MODE: return - logger.warning( - "SECURITY WARNING: Development endpoints are enabled! " - "This should NOT be used in production!" - ) app.include_router(router)