From 1666e664435a0b918b7186a020bcc59b8b3216c0 Mon Sep 17 00:00:00 2001 From: Xihui Cang <754795299@qq.com> Date: Tue, 15 Apr 2025 19:50:38 +0800 Subject: [PATCH] =?UTF-8?q?Add=20"/server=5Finfo"=20endpoint=20in=20api=5F?= =?UTF-8?q?server=20to=20retrieve=20the=20vllm=5Fconfig.=C2=A0=20(#16572)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Xihui Cang --- vllm/engine/async_llm_engine.py | 4 ++++ vllm/engine/llm_engine.py | 4 ++++ vllm/engine/multiprocessing/client.py | 4 ++++ vllm/engine/protocol.py | 7 ++++++- vllm/entrypoints/openai/api_server.py | 16 ++++++++++++---- vllm/v1/engine/async_llm.py | 5 ++++- vllm/v1/engine/llm_engine.py | 3 +++ 7 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7f9f85e1f93f2..67c7e109c9f04 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient): exception=asyncio.CancelledError, verbose=self.log_requests) + async def get_vllm_config(self) -> VllmConfig: + """Get the vllm configuration of the vLLM engine.""" + return self.engine.get_vllm_config() + async def get_model_config(self) -> ModelConfig: """Get the model configuration of the vLLM engine.""" return self.engine.get_model_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 54f7b8fb69b58..2347cdee904b3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -914,6 +914,10 @@ class LLMEngine: scheduler.abort_seq_group( request_id, seq_id_to_seq_group=self.seq_id_to_seq_group) + def get_vllm_config(self) -> VllmConfig: + """Gets the vllm configuration.""" + return self.vllm_config + def get_model_config(self) -> ModelConfig: """Gets the model configuration.""" return self.model_config diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index f058b13297bb0..6e56cbdbbf8c7 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient): self._errored_with: Optional[BaseException] = None # Get the configs. + self.vllm_config = engine_config self.model_config = engine_config.model_config self.decoding_config = engine_config.decoding_config @@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient): async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): return await self.tokenizer.get_lora_tokenizer_async(lora_request) + async def get_vllm_config(self) -> VllmConfig: + return self.vllm_config + async def get_decoding_config(self) -> DecodingConfig: return self.decoding_config diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e2974b02c5ba3..7e5ac3a284522 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from typing import AsyncGenerator, List, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function -from vllm.config import DecodingConfig, ModelConfig +from vllm.config import DecodingConfig, ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt @@ -220,6 +220,11 @@ class EngineClient(ABC): """ ... + @abstractmethod + async def get_vllm_config(self) -> VllmConfig: + """Get the vllm configuration of the vLLM engine.""" + ... + @abstractmethod async def get_model_config(self) -> ModelConfig: """Get the model configuration of the vLLM engine.""" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6a8bdd0602285..2c15aa8a9335a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -30,7 +30,7 @@ from starlette.routing import Mount from typing_extensions import assert_never import vllm.envs as envs -from vllm.config import ModelConfig +from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore from vllm.engine.multiprocessing.client import MQLLMEngineClient @@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI): "/load", "/ping", "/version", + "/server_info", ], registry=registry, ).add().instrument(app).expose(app) @@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = { if envs.VLLM_SERVER_DEV_MODE: + @router.get("/server_info") + async def show_server_info(raw_request: Request): + server_info = {"vllm_config": str(raw_request.app.state.vllm_config)} + return JSONResponse(content=server_info) + @router.post("/reset_prefix_cache") async def reset_prefix_cache(raw_request: Request): """ @@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI: async def init_app_state( engine_client: EngineClient, - model_config: ModelConfig, + vllm_config: VllmConfig, state: State, args: Namespace, ) -> None: @@ -915,6 +921,8 @@ async def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats + state.vllm_config = vllm_config + model_config = vllm_config.model_config resolved_chat_template = load_chat_template(args.chat_template) if resolved_chat_template is not None: @@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: async with build_async_engine_client(args) as engine_client: app = build_app(args) - model_config = await engine_client.get_model_config() - await init_app_state(engine_client, model_config, app.state, args) + vllm_config = await engine_client.get_vllm_config() + await init_app_state(engine_client, vllm_config, app.state, args) def _listen_addr(a: str) -> str: if is_valid_ipv6_address(a): diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b77a6824cddbd..6d24ba2bc9820 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -64,7 +64,7 @@ class AsyncLLM(EngineClient): assert start_engine_loop self.model_config = vllm_config.model_config - + self.vllm_config = vllm_config self.log_requests = log_requests self.log_stats = log_stats @@ -379,6 +379,9 @@ class AsyncLLM(EngineClient): ): raise ValueError("Not Supported on V1 yet.") + async def get_vllm_config(self) -> VllmConfig: + return self.vllm_config + async def get_model_config(self) -> ModelConfig: return self.model_config diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c67186f70401..c05319f3d80c6 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -230,6 +230,9 @@ class LLMEngine: return processed_outputs.request_outputs + def get_vllm_config(self): + return self.vllm_config + def get_model_config(self): return self.model_config