[Core] Expose API endpoint /is_sleeping (#14312)

Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
This commit is contained in:
Jun Duan 2025-03-15 09:28:14 -04:00 committed by GitHub
parent f58aea002c
commit 74bc397b0a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 100 additions and 4 deletions

View File

@ -28,5 +28,12 @@ def test_sleep_mode():
response = requests.post(remote_server.url_for("/sleep"),
data={"level": "1"})
assert response.status_code == 200
response = requests.get(remote_server.url_for("/is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is True
response = requests.post(remote_server.url_for("/wake_up"))
assert response.status_code == 200
response = requests.get(remote_server.url_for("/is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is False

View File

@ -1225,6 +1225,9 @@ class AsyncLLMEngine(EngineClient):
async def wake_up(self) -> None:
self.engine.wake_up()
async def is_sleeping(self) -> bool:
    """Return True if the wrapped synchronous engine is in sleep mode."""
    return self.engine.is_sleeping()
async def add_lora(self, lora_request: LoRARequest) -> None:
self.engine.add_lora(lora_request)

View File

@ -1948,6 +1948,9 @@ class LLMEngine:
"Sleep mode is not enabled in the model config")
self.model_executor.wake_up()
def is_sleeping(self) -> bool:
    """Return True if the model executor is currently in sleep mode."""
    # NOTE: is_sleeping is accessed as an attribute/property on the
    # executor here (no call parentheses), unlike the callable
    # is_sleeping() exposed by the engine-level wrappers.
    return self.model_executor.is_sleeping
def check_health(self) -> None:
if self.tokenizer:
self.tokenizer.check_health()

View File

@ -136,6 +136,18 @@ class RPCWakeUpRequest(Enum):
WAKE_UP = 1
@dataclass
class RPCIsSleepingRequest:
    """RPC message asking the engine whether it is currently sleeping."""
    # Unique id generated per request so the client can match the
    # engine's RPCIsSleepingResponse back to this query.
    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
@dataclass
class RPCIsSleepingResponse:
    """RPC reply carrying the engine's current sleeping state."""
    # Echoes the request_id of the originating RPCIsSleepingRequest.
    request_id: str
    # True if the engine is currently sleeping.
    is_sleeping: bool
@dataclass
class RPCLoadAdapterRequest:
lora_request: LoRARequest
@ -151,10 +163,10 @@ class RPCAdapterLoadedResponse:
RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
RPCUProfileRequest, RPCLoadAdapterRequest,
RPCResetPrefixCacheRequest, RPCSleepRequest,
RPCWakeUpRequest]
RPCWakeUpRequest, RPCIsSleepingRequest]
REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
RPCError]
RPCIsSleepingResponse, RPCError]
def ENGINE_DEAD_ERROR(

View File

@ -27,6 +27,8 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
IPC_OUTPUT_EXT, RPC_REQUEST_T,
VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
RPCAdapterLoadedResponse, RPCError,
RPCIsSleepingRequest,
RPCIsSleepingResponse,
RPCLoadAdapterRequest,
RPCProcessRequest,
RPCResetPrefixCacheRequest,
@ -246,7 +248,9 @@ class MQLLMEngineClient(EngineClient):
if queue is not None:
queue.put_nowait(exception)
# Put each output into the appropriate queue.
elif isinstance(request_outputs, RPCAdapterLoadedResponse):
elif isinstance(
request_outputs,
(RPCAdapterLoadedResponse, RPCIsSleepingResponse)):
self._add_output(request_outputs)
else:
for request_output in request_outputs:
@ -256,7 +260,8 @@ class MQLLMEngineClient(EngineClient):
logger.debug("Shutting down MQLLMEngineClient output handler.")
def _add_output(self, request_output: Union[RequestOutput,
RPCAdapterLoadedResponse]):
RPCAdapterLoadedResponse,
RPCIsSleepingResponse]):
queue = self.output_queues.get(request_output.request_id)
if queue is not None:
queue.put_nowait(request_output)
@ -696,6 +701,24 @@ class MQLLMEngineClient(EngineClient):
return await self._send_one_way_rpc_request(
request=RPCWakeUpRequest.WAKE_UP, socket=self.input_socket)
async def is_sleeping(self) -> bool:
    """Check whether the engine is sleeping.

    Sends an RPCIsSleepingRequest over the input socket and waits for
    the matching RPCIsSleepingResponse on a per-request queue keyed by
    the request's unique id.

    Returns:
        True if the engine is currently sleeping, False otherwise.

    Raises:
        BaseException: re-raises any exception object the engine
            placed on the response queue instead of a normal response.
    """
    request = RPCIsSleepingRequest()

    # Register a queue so the output handler can route the response
    # back to this coroutine (keyed by the request's unique id).
    queue: asyncio.Queue[Union[BaseException,
                               RPCIsSleepingResponse]] = asyncio.Queue()
    self.output_queues[request.request_id] = queue

    try:
        request_bytes = pickle.dumps(request)
        await self.input_socket.send_multipart((request_bytes, ),
                                               copy=False)
        request_output = await queue.get()
    finally:
        # Always unregister the queue -- even if we are cancelled while
        # awaiting -- so output_queues does not leak stale entries.
        self.output_queues.pop(request.request_id, None)

    if isinstance(request_output, BaseException):
        raise request_output
    return request_output.is_sleeping
async def add_lora(self, lora_request: LoRARequest) -> None:
"""Load a new LoRA adapter into the engine for future requests."""
# Uses the same I/O as generate requests

View File

@ -18,6 +18,8 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T,
VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
RPCAdapterLoadedResponse, RPCError,
RPCIsSleepingRequest,
RPCIsSleepingResponse,
RPCLoadAdapterRequest,
RPCProcessRequest,
RPCResetPrefixCacheRequest,
@ -271,6 +273,8 @@ class MQLLMEngine:
self.sleep(request.value)
elif isinstance(request, RPCWakeUpRequest):
self.wake_up()
elif isinstance(request, RPCIsSleepingRequest):
self._handle_is_sleeping_request(request)
else:
raise ValueError("Unknown RPCRequest Type: "
f"{type(request)}")
@ -337,6 +341,12 @@ class MQLLMEngine:
self._send_outputs(
RPCAdapterLoadedResponse(request_id=request.request_id))
def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest):
    """Answer an is-sleeping query by sending the engine's state back."""
    response = RPCIsSleepingResponse(request_id=request.request_id,
                                     is_sleeping=self.is_sleeping())
    self._send_outputs(response)
def _health_check(self):
# Send unhealthy if engine has already errored
if self._errored_with is not None:
@ -406,6 +416,9 @@ class MQLLMEngine:
def wake_up(self) -> None:
self.engine.wake_up()
def is_sleeping(self) -> bool:
    """Return True if the wrapped LLMEngine is in sleep mode."""
    return self.engine.is_sleeping()
def signal_handler(*_) -> None:
raise KeyboardInterrupt("MQLLMEngine terminated")

View File

@ -288,6 +288,11 @@ class EngineClient(ABC):
"""Wake up the engine"""
...
@abstractmethod
async def is_sleeping(self) -> bool:
    """Return True if the engine is currently in sleep mode."""
    ...
@abstractmethod
async def add_lora(self, lora_request: LoRARequest) -> None:
"""Load a new LoRA adapter into the engine for future requests."""

View File

@ -694,6 +694,12 @@ if envs.VLLM_SERVER_DEV_MODE:
# is sent but does not finish yet when we return a response.
return Response(status_code=200)
@router.get("/is_sleeping")
async def is_sleeping(raw_request: Request):
    """Report the engine's sleep state as a JSON body."""
    logger.info("check whether the engine is sleeping")
    client = engine_client(raw_request)
    # Avoid shadowing the handler name with the boolean result.
    sleeping = await client.is_sleeping()
    return JSONResponse(content={"is_sleeping": sleeping})
@router.post("/invocations", dependencies=[Depends(validate_json_request)])
async def invocations(raw_request: Request):

View File

@ -407,6 +407,9 @@ class AsyncLLM(EngineClient):
async def wake_up(self) -> None:
await self.engine_core.wake_up_async()
async def is_sleeping(self) -> bool:
    """Return True if the engine core reports it is sleeping."""
    return await self.engine_core.is_sleeping_async()
async def add_lora(self, lora_request: LoRARequest) -> bool:
"""Load a new LoRA adapter into the engine for future requests."""
return await self.engine_core.add_lora_async(lora_request)

View File

@ -253,6 +253,9 @@ class EngineCore:
def wake_up(self):
self.model_executor.wake_up()
def is_sleeping(self) -> bool:
    """Return True if the model executor is currently sleeping."""
    # NOTE: accessed as an attribute/property on the executor,
    # not called -- mirrors the v0 LLMEngine implementation.
    return self.model_executor.is_sleeping
def execute_dummy_batch(self):
self.model_executor.collective_rpc("execute_dummy_batch")

View File

@ -89,6 +89,9 @@ class EngineCoreClient(ABC):
def wake_up(self) -> None:
raise NotImplementedError
def is_sleeping(self) -> bool:
    """Return the engine's sleeping state; implemented by subclasses."""
    raise NotImplementedError
def execute_dummy_batch(self) -> None:
raise NotImplementedError
@ -128,6 +131,9 @@ class EngineCoreClient(ABC):
async def wake_up_async(self) -> None:
raise NotImplementedError
async def is_sleeping_async(self) -> bool:
    """Async variant of is_sleeping; implemented by async subclasses."""
    raise NotImplementedError
async def abort_requests_async(self, request_ids: list[str]) -> None:
raise NotImplementedError
@ -182,6 +188,9 @@ class InprocClient(EngineCoreClient):
def wake_up(self) -> None:
self.engine_core.wake_up()
def is_sleeping(self) -> bool:
    """Query the in-process engine core's sleeping state directly."""
    return self.engine_core.is_sleeping()
def execute_dummy_batch(self) -> None:
self.engine_core.execute_dummy_batch()
@ -433,6 +442,9 @@ class SyncMPClient(MPClient):
def wake_up(self) -> None:
self._call_utility("wake_up")
def is_sleeping(self) -> bool:
    """Query the engine process's sleeping state via a utility RPC."""
    return self._call_utility("is_sleeping")
def execute_dummy_batch(self) -> None:
self._call_utility("execute_dummy_batch")
@ -523,6 +535,9 @@ class AsyncMPClient(MPClient):
async def wake_up_async(self) -> None:
await self._call_utility_async("wake_up")
async def is_sleeping_async(self) -> bool:
    """Asynchronously query the engine process's sleeping state."""
    return await self._call_utility_async("is_sleeping")
async def execute_dummy_batch_async(self) -> None:
await self._call_utility_async("execute_dummy_batch")

View File

@ -235,6 +235,9 @@ class LLMEngine:
def wake_up(self):
self.engine_core.wake_up()
def is_sleeping(self) -> bool:
    """Return True if the engine core is currently sleeping."""
    return self.engine_core.is_sleeping()
def get_tokenizer_group(
self,
group_type: type[_G] = BaseTokenizerGroup,