[Refactor] [4/N] Move VLLM_SERVER_DEV endpoints into the serve directory (#30749)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
This commit is contained in:
Chauncey 2025-12-17 18:27:30 +08:00 committed by GitHub
parent f284d7bd0c
commit 9ad5b21710
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 259 additions and 151 deletions

View File

@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_lora_adapters.py \
--ignore=entrypoints/openai/test_return_tokens_as_ids.py \

View File

@ -128,7 +128,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
@ -148,7 +148,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
- label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
@ -162,10 +162,28 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/sleep
- tests/entrypoints/rpc
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@ -751,17 +769,6 @@ steps:
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test #####

View File

@ -114,7 +114,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration Test (LLM) # 30min
timeout_in_minutes: 40
@ -132,7 +132,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
- label: Entrypoints Integration Test (API Server 1) # 100min
timeout_in_minutes: 130
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
@ -144,10 +144,26 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (API Server 2)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
fast_check: true
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/entrypoints/sleep
- tests/entrypoints/rpc
- tests/tool_use
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/sleep
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s tool_use
- label: Entrypoints Integration Test (Pooling)
timeout_in_minutes: 50
mirror_hardwares: [amdexperimental]
@ -666,16 +682,6 @@ steps:
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
##### models test #####
- label: Basic Models Tests (Initialization)

View File

@ -10,7 +10,7 @@ steps:
- tests/entrypoints/
commands:
- pytest -v -s entrypoints/openai/tool_parsers
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
- label: Entrypoints Integration (LLM)
timeout_in_minutes: 40
@ -25,7 +25,7 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration (API Server)
- label: Entrypoints Integration (API Server 1)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
@ -34,11 +34,26 @@ steps:
- tests/entrypoints/test_chat_utils
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
- pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration (API Server 2)
timeout_in_minutes: 130
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- vllm/
- tests/tool_use
- tests/entrypoints/sleep
- tests/entrypoints/instrumentator
- tests/entrypoints/rpc
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
- pytest -v -s entrypoints/instrumentator
- pytest -v -s entrypoints/sleep
- pytest -v -s tool_use
- label: Entrypoints Integration (Pooling)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/tests"

View File

@ -1,13 +0,0 @@
group: Tool use
depends_on:
- image-build
steps:
- label: OpenAI-Compatible Tool Use
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use

View File

@ -14,11 +14,10 @@ import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer
from tests.conftest import LocalAssetServer
from tests.utils import RemoteOpenAIServer
from vllm import version
from ...conftest import LocalAssetServer
from ...utils import RemoteOpenAIServer
MODELS = {
"text": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"multimodal": "HuggingFaceTB/SmolVLM-256M-Instruct",

View File

View File

@ -37,7 +37,7 @@ def server():
"--max-num-seqs",
"128",
"--worker-extension-cls",
"tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
"tests.entrypoints.rpc.test_collective_rpc.TestWorkerExtension",
]
with RemoteOpenAIServer(
MODEL_NAME,

View File

View File

@ -4,7 +4,7 @@
import requests
from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B"

View File

@ -17,21 +17,20 @@ from argparse import Namespace
from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
from contextlib import asynccontextmanager
from http import HTTPStatus
from typing import Annotated, Any, Literal
from typing import Annotated, Any
import model_hosting_container_standards.sagemaker as sagemaker_standards
import pydantic
import uvloop
from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Query, Request
from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.concurrency import iterate_in_threadpool
from starlette.datastructures import URL, Headers, MutableHeaders, State
from starlette.types import ASGIApp, Message, Receive, Scope, Send
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.anthropic.protocol import (
@ -639,97 +638,6 @@ async def create_translations(
return StreamingResponse(content=generator, media_type="text/event-stream")
if envs.VLLM_SERVER_DEV_MODE:
logger.warning(
"SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!"
)
PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
@router.get("/server_info")
async def show_server_info(
raw_request: Request,
config_format: Annotated[Literal["text", "json"], Query()] = "text",
):
vllm_config: VllmConfig = raw_request.app.state.vllm_config
server_info = {
"vllm_config": str(vllm_config)
if config_format == "text"
else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
# fallback=str is needed to handle e.g. torch.dtype
}
return JSONResponse(content=server_info)
@router.post("/reset_prefix_cache")
async def reset_prefix_cache(
raw_request: Request,
reset_running_requests: bool = Query(default=False),
reset_external: bool = Query(default=False),
):
"""
Reset the local prefix cache.
Optionally, if the query parameter `reset_external=true`
also resets the external (connector-managed) prefix cache.
Note that we currently do not check if the prefix cache
is successfully reset in the API server.
Example:
POST /reset_prefix_cache?reset_external=true
"""
logger.info("Resetting prefix cache...")
await engine_client(raw_request).reset_prefix_cache(
reset_running_requests, reset_external
)
return Response(status_code=200)
@router.post("/reset_mm_cache")
async def reset_mm_cache(raw_request: Request):
"""
Reset the multi-modal cache. Note that we currently do not check if the
multi-modal cache is successfully reset in the API server.
"""
logger.info("Resetting multi-modal cache...")
await engine_client(raw_request).reset_mm_cache()
return Response(status_code=200)
@router.post("/collective_rpc")
async def collective_rpc(raw_request: Request):
try:
body = await raw_request.json()
except json.JSONDecodeError as e:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST.value,
detail=f"JSON decode error: {e}",
) from e
method = body.get("method")
if method is None:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST.value,
detail="Missing 'method' in request body",
)
# For security reason, only serialized string args/kwargs are passed.
# User-defined `method` is responsible for deserialization if needed.
args: list[str] = body.get("args", [])
kwargs: dict[str, str] = body.get("kwargs", {})
timeout: float | None = body.get("timeout")
results = await engine_client(raw_request).collective_rpc(
method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
)
if results is None:
return Response(status_code=200)
response: list[Any] = []
for result in results:
if result is None or isinstance(result, dict | list):
response.append(result)
else:
response.append(str(result))
return JSONResponse(content={"results": response})
def load_log_config(log_config_file: str | None) -> dict | None:
if not log_config_file:
return None

View File

@ -4,8 +4,19 @@
from fastapi import FastAPI
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def register_vllm_serve_api_routers(app: FastAPI):
if envs.VLLM_SERVER_DEV_MODE:
logger.warning(
"SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!"
)
from vllm.entrypoints.serve.lora.api_router import (
attach_router as attach_lora_router,
)
@ -29,6 +40,18 @@ def register_vllm_serve_api_routers(app: FastAPI):
attach_sleep_router(app)
from vllm.entrypoints.serve.rpc.api_router import (
attach_router as attach_rpc_router,
)
attach_rpc_router(app)
from vllm.entrypoints.serve.cache.api_router import (
attach_router as attach_cache_router,
)
attach_cache_router(app)
from vllm.entrypoints.serve.tokenize.api_router import (
attach_router as attach_tokenize_router,
)
@ -58,3 +81,9 @@ def register_vllm_serve_api_routers(app: FastAPI):
)
attach_health_router(app)
from vllm.entrypoints.serve.instrumentator.server_info import (
attach_router as attach_server_info_router,
)
attach_server_info_router(app)

View File

View File

@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, FastAPI, Query, Request
from fastapi.responses import Response
import vllm.envs as envs
from vllm.engine.protocol import EngineClient
from vllm.logger import init_logger
logger = init_logger(__name__)
router = APIRouter()
def engine_client(request: Request) -> EngineClient:
return request.app.state.engine_client
@router.post("/reset_prefix_cache")
async def reset_prefix_cache(
raw_request: Request,
reset_running_requests: bool = Query(default=False),
reset_external: bool = Query(default=False),
):
"""
Reset the local prefix cache.
Optionally, if the query parameter `reset_external=true`
also resets the external (connector-managed) prefix cache.
Note that we currently do not check if the prefix cache
is successfully reset in the API server.
Example:
POST /reset_prefix_cache?reset_external=true
"""
logger.info("Resetting prefix cache...")
await engine_client(raw_request).reset_prefix_cache(
reset_running_requests, reset_external
)
return Response(status_code=200)
@router.post("/reset_mm_cache")
async def reset_mm_cache(raw_request: Request):
"""
Reset the multi-modal cache. Note that we currently do not check if the
multi-modal cache is successfully reset in the API server.
"""
logger.info("Resetting multi-modal cache...")
await engine_client(raw_request).reset_mm_cache()
return Response(status_code=200)
def attach_router(app: FastAPI):
if not envs.VLLM_SERVER_DEV_MODE:
return
app.include_router(router)

View File

@ -0,0 +1,40 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Annotated, Literal
import pydantic
from fastapi import APIRouter, FastAPI, Query, Request
from fastapi.responses import JSONResponse
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
router = APIRouter()
PydanticVllmConfig = pydantic.TypeAdapter(VllmConfig)
@router.get("/server_info")
async def show_server_info(
raw_request: Request,
config_format: Annotated[Literal["text", "json"], Query()] = "text",
):
vllm_config: VllmConfig = raw_request.app.state.vllm_config
server_info = {
"vllm_config": str(vllm_config)
if config_format == "text"
else PydanticVllmConfig.dump_python(vllm_config, mode="json", fallback=str)
# fallback=str is needed to handle e.g. torch.dtype
}
return JSONResponse(content=server_info)
def attach_router(app: FastAPI):
if not envs.VLLM_SERVER_DEV_MODE:
return
app.include_router(router)

View File

View File

@ -0,0 +1,61 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from http import HTTPStatus
from typing import Any
from fastapi import APIRouter, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, Response
import vllm.envs as envs
from vllm.engine.protocol import EngineClient
from vllm.logger import init_logger
logger = init_logger(__name__)
router = APIRouter()
def engine_client(request: Request) -> EngineClient:
return request.app.state.engine_client
@router.post("/collective_rpc")
async def collective_rpc(raw_request: Request):
try:
body = await raw_request.json()
except json.JSONDecodeError as e:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST.value,
detail=f"JSON decode error: {e}",
) from e
method = body.get("method")
if method is None:
raise HTTPException(
status_code=HTTPStatus.BAD_REQUEST.value,
detail="Missing 'method' in request body",
)
# For security reason, only serialized string args/kwargs are passed.
# User-defined `method` is responsible for deserialization if needed.
args: list[str] = body.get("args", [])
kwargs: dict[str, str] = body.get("kwargs", {})
timeout: float | None = body.get("timeout")
results = await engine_client(raw_request).collective_rpc(
method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
)
if results is None:
return Response(status_code=200)
response: list[Any] = []
for result in results:
if result is None or isinstance(result, dict | list):
response.append(result)
else:
response.append(str(result))
return JSONResponse(content={"results": response})
def attach_router(app: FastAPI):
if not envs.VLLM_SERVER_DEV_MODE:
return
app.include_router(router)

View File

@ -52,9 +52,5 @@ async def is_sleeping(raw_request: Request):
def attach_router(app: FastAPI):
if not envs.VLLM_SERVER_DEV_MODE:
return
logger.warning(
"SECURITY WARNING: Development endpoints are enabled! "
"This should NOT be used in production!"
)
app.include_router(router)