From 1666e664435a0b918b7186a020bcc59b8b3216c0 Mon Sep 17 00:00:00 2001
From: Xihui Cang <754795299@qq.com>
Date: Tue, 15 Apr 2025 19:50:38 +0800
Subject: [PATCH] =?UTF-8?q?Add=20"/server=5Finfo"=20endpoint=20in=20api=5F?=
 =?UTF-8?q?server=20to=20retrieve=20the=20vllm=5Fconfig.=C2=A0=20(#16572)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Xihui Cang <xihuicang@gmail.com>
---
 vllm/engine/async_llm_engine.py       |  4 ++++
 vllm/engine/llm_engine.py             |  4 ++++
 vllm/engine/multiprocessing/client.py |  4 ++++
 vllm/engine/protocol.py               |  7 ++++++-
 vllm/entrypoints/openai/api_server.py | 16 ++++++++++++----
 vllm/v1/engine/async_llm.py           |  5 ++++-
 vllm/v1/engine/llm_engine.py          |  3 +++
 7 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 7f9f85e1f93f2..67c7e109c9f04 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1167,6 +1167,10 @@ class AsyncLLMEngine(EngineClient):
                                             exception=asyncio.CancelledError,
                                             verbose=self.log_requests)
 
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        return self.engine.get_vllm_config()
+
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
         return self.engine.get_model_config()
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 54f7b8fb69b58..2347cdee904b3 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -914,6 +914,10 @@ class LLMEngine:
             scheduler.abort_seq_group(
                 request_id, seq_id_to_seq_group=self.seq_id_to_seq_group)
 
+    def get_vllm_config(self) -> VllmConfig:
+        """Gets the vllm configuration."""
+        return self.vllm_config
+
     def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
         return self.model_config
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index f058b13297bb0..6e56cbdbbf8c7 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -93,6 +93,7 @@ class MQLLMEngineClient(EngineClient):
         self._errored_with: Optional[BaseException] = None
 
         # Get the configs.
+        self.vllm_config = engine_config
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
@@ -377,6 +378,9 @@ class MQLLMEngineClient(EngineClient):
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
         return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_decoding_config(self) -> DecodingConfig:
         return self.decoding_config
 
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index e2974b02c5ba3..7e5ac3a284522 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 from typing import AsyncGenerator, List, Mapping, Optional
 
 from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function
-from vllm.config import DecodingConfig, ModelConfig
+from vllm.config import DecodingConfig, ModelConfig, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import is_explicit_encoder_decoder_prompt
@@ -220,6 +220,11 @@ class EngineClient(ABC):
         """
         ...
 
+    @abstractmethod
+    async def get_vllm_config(self) -> VllmConfig:
+        """Get the vllm configuration of the vLLM engine."""
+        ...
+
     @abstractmethod
     async def get_model_config(self) -> ModelConfig:
         """Get the model configuration of the vLLM engine."""
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 6a8bdd0602285..2c15aa8a9335a 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -30,7 +30,7 @@ from starlette.routing import Mount
 from typing_extensions import assert_never
 
 import vllm.envs as envs
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
@@ -327,6 +327,7 @@ def mount_metrics(app: FastAPI):
                 "/load",
                 "/ping",
                 "/version",
+                "/server_info",
             ],
             registry=registry,
         ).add().instrument(app).expose(app)
@@ -687,6 +688,11 @@ TASK_HANDLERS: dict[str, dict[str, tuple]] = {
 
 if envs.VLLM_SERVER_DEV_MODE:
 
+    @router.get("/server_info")
+    async def show_server_info(raw_request: Request):
+        server_info = {"vllm_config": str(raw_request.app.state.vllm_config)}
+        return JSONResponse(content=server_info)
+
     @router.post("/reset_prefix_cache")
     async def reset_prefix_cache(raw_request: Request):
         """
@@ -894,7 +900,7 @@ def build_app(args: Namespace) -> FastAPI:
 
 async def init_app_state(
     engine_client: EngineClient,
-    model_config: ModelConfig,
+    vllm_config: VllmConfig,
     state: State,
     args: Namespace,
 ) -> None:
@@ -915,6 +921,8 @@ async def init_app_state(
 
     state.engine_client = engine_client
     state.log_stats = not args.disable_log_stats
+    state.vllm_config = vllm_config
+    model_config = vllm_config.model_config
 
     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
@@ -1069,8 +1077,8 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     async with build_async_engine_client(args) as engine_client:
         app = build_app(args)
 
-        model_config = await engine_client.get_model_config()
-        await init_app_state(engine_client, model_config, app.state, args)
+        vllm_config = await engine_client.get_vllm_config()
+        await init_app_state(engine_client, vllm_config, app.state, args)
 
         def _listen_addr(a: str) -> str:
             if is_valid_ipv6_address(a):
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index b77a6824cddbd..6d24ba2bc9820 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -64,7 +64,7 @@ class AsyncLLM(EngineClient):
         assert start_engine_loop
 
         self.model_config = vllm_config.model_config
-
+        self.vllm_config = vllm_config
         self.log_requests = log_requests
         self.log_stats = log_stats
 
@@ -379,6 +379,9 @@ class AsyncLLM(EngineClient):
     ):
         raise ValueError("Not Supported on V1 yet.")
 
+    async def get_vllm_config(self) -> VllmConfig:
+        return self.vllm_config
+
     async def get_model_config(self) -> ModelConfig:
         return self.model_config
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 4c67186f70401..c05319f3d80c6 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -230,6 +230,9 @@ class LLMEngine:
 
         return processed_outputs.request_outputs
 
+    def get_vllm_config(self):
+        return self.vllm_config
+
     def get_model_config(self):
         return self.model_config