diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py
index e07436f89d2d2..5f94ac6da2c25 100644
--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import requests
+from prometheus_client.parser import text_string_to_metric_families
 
 from ...utils import RemoteOpenAIServer
 
@@ -31,12 +32,28 @@ def test_sleep_mode():
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is True
 
+        # check sleep metrics
+        response = requests.get(remote_server.url_for("metrics"))
+        assert response.status_code == 200
+        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+        assert awake == 0
+        assert weights_offloaded == 1
+        assert discard_all == 0
+
         response = requests.post(remote_server.url_for("wake_up"))
         assert response.status_code == 200
         response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is False
 
+        # check sleep metrics
+        response = requests.get(remote_server.url_for("metrics"))
+        assert response.status_code == 200
+        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+        assert awake == 1
+        assert weights_offloaded == 0
+        assert discard_all == 0
+
         # test wake up with tags
         response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
         assert response.status_code == 200
@@ -59,3 +76,35 @@ def test_sleep_mode():
         response = requests.get(remote_server.url_for("is_sleeping"))
         assert response.status_code == 200
         assert response.json().get("is_sleeping") is False
+
+        # check sleep metrics
+        response = requests.get(remote_server.url_for("metrics"))
+        assert response.status_code == 200
+        awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
+        assert awake == 1
+        assert weights_offloaded == 0
+        assert discard_all == 0
+
+
+def _get_sleep_metrics_from_api(response: requests.Response):
+    """Return (awake, weights_offloaded, discard_all)"""
+
+    awake, weights_offloaded, discard_all = None, None, None
+
+    for family in text_string_to_metric_families(response.text):
+        if family.name == "vllm:engine_sleep_state":
+            for sample in family.samples:
+                if sample.name == "vllm:engine_sleep_state":
+                    for label_name, label_value in sample.labels.items():
+                        if label_value == "awake":
+                            awake = sample.value
+                        elif label_value == "weights_offloaded":
+                            weights_offloaded = sample.value
+                        elif label_value == "discard_all":
+                            discard_all = sample.value
+
+    assert awake is not None
+    assert weights_offloaded is not None
+    assert discard_all is not None
+
+    return awake, weights_offloaded, discard_all
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index cf458a8f074c0..761c37504d80a 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -689,9 +689,15 @@ class AsyncLLM(EngineClient):
         await self.reset_prefix_cache()
         await self.engine_core.sleep_async(level)
 
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(1, level)
+
     async def wake_up(self, tags: list[str] | None = None) -> None:
         await self.engine_core.wake_up_async(tags)
 
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(0, 0)
+
     async def is_sleeping(self) -> bool:
         return await self.engine_core.is_sleeping_async()
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 486dacb2e5d9c..0fce343702e0a 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -332,9 +332,15 @@ class LLMEngine:
     def sleep(self, level: int = 1):
         self.engine_core.sleep(level)
 
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(1, level)
+
     def wake_up(self, tags: list[str] | None = None):
         self.engine_core.wake_up(tags)
 
+        if self.logger_manager is not None:
+            self.logger_manager.record_sleep_state(0, 0)
+
     def is_sleeping(self) -> bool:
         return self.engine_core.is_sleeping()
 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index c5d7885eefb79..055da5d856b25 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -9,6 +9,7 @@
 from typing import TypeAlias
 
 from prometheus_client import Counter, Gauge, Histogram
+import vllm.envs as envs
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorLogging
 from vllm.logger import init_logger
@@ -56,6 +57,9 @@ class StatLoggerBase(ABC):
     def log(self):  # noqa
         pass
 
+    def record_sleep_state(self, is_awake: int, level: int):  # noqa
+        pass
+
 
 def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
     factories: list[StatLoggerFactory] = []
@@ -384,8 +388,33 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
         self.gauge_scheduler_waiting = make_per_engine(
             gauge_scheduler_waiting, engine_indexes, model_name
         )
+        if envs.VLLM_SERVER_DEV_MODE:
+            gauge_engine_sleep_state = self._gauge_cls(
+                name="vllm:engine_sleep_state",
+                documentation=(
+                    "Engine sleep state; awake = 0 means engine is sleeping; "
+                    "awake = 1 means engine is awake; "
+                    "weights_offloaded = 1 means sleep level 1; "
+                    "discard_all = 1 means sleep level 2."
+                ),
+                labelnames=labelnames + ["sleep_state"],
+                multiprocess_mode="mostrecent",
+            )
+
+            self.gauge_engine_sleep_state = {}
+            sleep_state = ["awake", "weights_offloaded", "discard_all"]
+
+            for s in sleep_state:
+                self.gauge_engine_sleep_state[s] = {
+                    idx: gauge_engine_sleep_state.labels(
+                        engine=idx, model_name=model_name, sleep_state=s
+                    )
+                    for idx in engine_indexes
+                }
+
+            # Setting default values
+            self.record_sleep_state()
 
-        #
         # GPU cache
         #
         # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
@@ -1010,6 +1039,25 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
             }
             self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
 
+    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        awake = 1
+        discard_all = 0
+        weights_offloaded = 0
+
+        if sleep == 1:
+            awake = 0
+            if level == 1:
+                weights_offloaded = 1
+            elif level == 2:
+                discard_all = 1
+
+        for engine_idx in self.engine_indexes:
+            self.gauge_engine_sleep_state["discard_all"][engine_idx].set(discard_all)
+            self.gauge_engine_sleep_state["weights_offloaded"][engine_idx].set(
+                weights_offloaded
+            )
+            self.gauge_engine_sleep_state["awake"][engine_idx].set(awake)
+
     def log_engine_initialized(self):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)
 
@@ -1131,6 +1179,10 @@ class StatLoggerManager:
                 engine_idx=engine_idx,
             )
 
+    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        for logger in self.stat_loggers:
+            logger.record_sleep_state(sleep, level)
+
     def log(self):
         for logger in self.stat_loggers:
             logger.log()
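Usage note (not part of the diff): the sketch below shows one way an operator could read the new vllm:engine_sleep_state gauge from a running server's /metrics endpoint, mirroring the parsing done by _get_sleep_metrics_from_api in the test above. The base URL is a placeholder assumption; the gauge is only registered when VLLM_SERVER_DEV_MODE is enabled, and it only changes when the engine is put to sleep or woken up.

# Minimal sketch, not part of the PR: poll /metrics and report the
# sleep-state gauge. Assumes a vLLM OpenAI-compatible server reachable at
# BASE_URL (hypothetical) that was started with VLLM_SERVER_DEV_MODE set.
import requests
from prometheus_client.parser import text_string_to_metric_families

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment


def read_sleep_state(base_url: str = BASE_URL) -> dict[str, float]:
    """Return {"awake": ..., "weights_offloaded": ..., "discard_all": ...}."""
    text = requests.get(f"{base_url}/metrics").text
    state: dict[str, float] = {}
    for family in text_string_to_metric_families(text):
        if family.name != "vllm:engine_sleep_state":
            continue
        for sample in family.samples:
            # Each sample carries a sleep_state label (awake,
            # weights_offloaded, or discard_all); the value is 0 or 1.
            label = sample.labels.get("sleep_state")
            if label is not None:
                state[label] = sample.value
    return state


if __name__ == "__main__":
    # e.g. {"awake": 0.0, "weights_offloaded": 1.0, "discard_all": 0.0}
    # while the engine is sleeping at level 1.
    print(read_sleep_state())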