mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-18 03:55:29 +08:00
[Core] Exposing engine sleep & wake_up state as prometheus metrics (#24176)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
This commit is contained in:
parent
5522fb274b
commit
1da3309ace
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from prometheus_client.parser import text_string_to_metric_families
|
||||||
|
|
||||||
from ...utils import RemoteOpenAIServer
|
from ...utils import RemoteOpenAIServer
|
||||||
|
|
||||||
@ -31,12 +32,28 @@ def test_sleep_mode():
|
|||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json().get("is_sleeping") is True
|
assert response.json().get("is_sleeping") is True
|
||||||
|
|
||||||
|
# check sleep metrics
|
||||||
|
response = requests.get(remote_server.url_for("metrics"))
|
||||||
|
assert response.status_code == 200
|
||||||
|
awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
|
||||||
|
assert awake == 0
|
||||||
|
assert weights_offloaded == 1
|
||||||
|
assert discard_all == 0
|
||||||
|
|
||||||
response = requests.post(remote_server.url_for("wake_up"))
|
response = requests.post(remote_server.url_for("wake_up"))
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
response = requests.get(remote_server.url_for("is_sleeping"))
|
response = requests.get(remote_server.url_for("is_sleeping"))
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json().get("is_sleeping") is False
|
assert response.json().get("is_sleeping") is False
|
||||||
|
|
||||||
|
# check sleep metrics
|
||||||
|
response = requests.get(remote_server.url_for("metrics"))
|
||||||
|
assert response.status_code == 200
|
||||||
|
awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
|
||||||
|
assert awake == 1
|
||||||
|
assert weights_offloaded == 0
|
||||||
|
assert discard_all == 0
|
||||||
|
|
||||||
# test wake up with tags
|
# test wake up with tags
|
||||||
response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
|
response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
@ -59,3 +76,35 @@ def test_sleep_mode():
|
|||||||
response = requests.get(remote_server.url_for("is_sleeping"))
|
response = requests.get(remote_server.url_for("is_sleeping"))
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json().get("is_sleeping") is False
|
assert response.json().get("is_sleeping") is False
|
||||||
|
|
||||||
|
# check sleep metrics
|
||||||
|
response = requests.get(remote_server.url_for("metrics"))
|
||||||
|
assert response.status_code == 200
|
||||||
|
awake, weights_offloaded, discard_all = _get_sleep_metrics_from_api(response)
|
||||||
|
assert awake == 1
|
||||||
|
assert weights_offloaded == 0
|
||||||
|
assert discard_all == 0
|
||||||
|
|
||||||
|
|
||||||
|
def _get_sleep_metrics_from_api(response: requests.Response):
    """Return (awake, weights_offloaded, discard_all) from a metrics response.

    Parses the Prometheus text exposition in ``response.text`` and reads the
    samples of the ``vllm:engine_sleep_state`` metric, keyed by their
    ``sleep_state`` label. If several samples share the same ``sleep_state``
    value, the last one parsed wins.

    Raises:
        AssertionError: if any of the three states is absent from the metrics.
    """
    metric_name = "vllm:engine_sleep_state"
    states = {"awake": None, "weights_offloaded": None, "discard_all": None}

    for family in text_string_to_metric_families(response.text):
        if family.name != metric_name:
            continue
        for sample in family.samples:
            if sample.name != metric_name:
                continue
            # Read the ``sleep_state`` label explicitly. Matching on the value
            # of *any* label would misfire if another label (for instance the
            # model name) happened to equal one of the state names.
            state = sample.labels.get("sleep_state")
            if state in states:
                states[state] = sample.value

    awake = states["awake"]
    weights_offloaded = states["weights_offloaded"]
    discard_all = states["discard_all"]

    assert awake is not None
    assert weights_offloaded is not None
    assert discard_all is not None

    return awake, weights_offloaded, discard_all
|
||||||
|
|||||||
@ -689,9 +689,15 @@ class AsyncLLM(EngineClient):
|
|||||||
await self.reset_prefix_cache()
|
await self.reset_prefix_cache()
|
||||||
await self.engine_core.sleep_async(level)
|
await self.engine_core.sleep_async(level)
|
||||||
|
|
||||||
|
if self.logger_manager is not None:
|
||||||
|
self.logger_manager.record_sleep_state(1, level)
|
||||||
|
|
||||||
async def wake_up(self, tags: list[str] | None = None) -> None:
|
async def wake_up(self, tags: list[str] | None = None) -> None:
|
||||||
await self.engine_core.wake_up_async(tags)
|
await self.engine_core.wake_up_async(tags)
|
||||||
|
|
||||||
|
if self.logger_manager is not None:
|
||||||
|
self.logger_manager.record_sleep_state(0, 0)
|
||||||
|
|
||||||
async def is_sleeping(self) -> bool:
    """Return whatever ``engine_core.is_sleeping_async()`` reports."""
    return await self.engine_core.is_sleeping_async()
|
||||||
|
|
||||||
|
|||||||
@ -332,9 +332,15 @@ class LLMEngine:
|
|||||||
def sleep(self, level: int = 1):
    """Put the engine core to sleep and publish the state to the stat loggers.

    Args:
        level: Sleep level, forwarded to ``engine_core.sleep`` and reported
            to the stat loggers together with the "sleeping" flag (1).
    """
    self.engine_core.sleep(level)

    # Metrics are optional: there is nothing to record without a manager.
    if self.logger_manager is not None:
        self.logger_manager.record_sleep_state(1, level)
|
||||||
|
|
||||||
def wake_up(self, tags: list[str] | None = None):
|
def wake_up(self, tags: list[str] | None = None):
|
||||||
self.engine_core.wake_up(tags)
|
self.engine_core.wake_up(tags)
|
||||||
|
|
||||||
|
if self.logger_manager is not None:
|
||||||
|
self.logger_manager.record_sleep_state(0, 0)
|
||||||
|
|
||||||
def is_sleeping(self) -> bool:
    """Return whatever ``engine_core.is_sleeping()`` reports."""
    return self.engine_core.is_sleeping()
|
||||||
|
|
||||||
|
|||||||
@ -9,6 +9,7 @@ from typing import TypeAlias
|
|||||||
|
|
||||||
from prometheus_client import Counter, Gauge, Histogram
|
from prometheus_client import Counter, Gauge, Histogram
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.config import SupportsMetricsInfo, VllmConfig
|
from vllm.config import SupportsMetricsInfo, VllmConfig
|
||||||
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorLogging
|
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorLogging
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -56,6 +57,9 @@ class StatLoggerBase(ABC):
|
|||||||
def log(self): # noqa
|
def log(self): # noqa
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def record_sleep_state(self, is_awake: int, level: int):  # noqa
    """Hook invoked when the engine is put to sleep or woken up.

    The base implementation does nothing; stat loggers that export the sleep
    state override it.

    Args:
        is_awake: NOTE(review): the engine ``sleep`` methods in this file
            pass 1 here and ``wake_up`` passes 0, so despite its name this
            argument behaves as a "sleeping" flag. Confirm and consider
            renaming; it is kept as-is for interface compatibility.
        level: Sleep level reported by the caller (0 on wake-up).
    """
|
||||||
|
|
||||||
|
|
||||||
def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
|
def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
|
||||||
factories: list[StatLoggerFactory] = []
|
factories: list[StatLoggerFactory] = []
|
||||||
@ -384,8 +388,33 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
self.gauge_scheduler_waiting = make_per_engine(
|
self.gauge_scheduler_waiting = make_per_engine(
|
||||||
gauge_scheduler_waiting, engine_indexes, model_name
|
gauge_scheduler_waiting, engine_indexes, model_name
|
||||||
)
|
)
|
||||||
|
if envs.VLLM_SERVER_DEV_MODE:
|
||||||
|
gauge_engine_sleep_state = self._gauge_cls(
|
||||||
|
name="vllm:engine_sleep_state",
|
||||||
|
documentation=(
|
||||||
|
"Engine sleep state; awake = 0 means engine is sleeping; "
|
||||||
|
"awake = 1 means engine is awake; "
|
||||||
|
"weights_offloaded = 1 means sleep level 1; "
|
||||||
|
"discard_all = 1 means sleep level 2."
|
||||||
|
),
|
||||||
|
labelnames=labelnames + ["sleep_state"],
|
||||||
|
multiprocess_mode="mostrecent",
|
||||||
|
)
|
||||||
|
|
||||||
|
self.gauge_engine_sleep_state = {}
|
||||||
|
sleep_state = ["awake", "weights_offloaded", "discard_all"]
|
||||||
|
|
||||||
|
for s in sleep_state:
|
||||||
|
self.gauge_engine_sleep_state[s] = {
|
||||||
|
idx: gauge_engine_sleep_state.labels(
|
||||||
|
engine=idx, model_name=model_name, sleep_state=s
|
||||||
|
)
|
||||||
|
for idx in engine_indexes
|
||||||
|
}
|
||||||
|
|
||||||
|
# Setting default values
|
||||||
|
self.record_sleep_state()
|
||||||
|
|
||||||
#
|
|
||||||
# GPU cache
|
# GPU cache
|
||||||
#
|
#
|
||||||
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
|
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
|
||||||
@ -1010,6 +1039,25 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
|
|||||||
}
|
}
|
||||||
self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
|
self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
|
||||||
|
|
||||||
|
def record_sleep_state(self, sleep: int = 0, level: int = 0):
    """Set the ``vllm:engine_sleep_state`` gauges for every engine index.

    Args:
        sleep: 1 when the engine is sleeping; any other value means awake.
        level: Sleep level, only consulted while sleeping. Level 1 sets
            ``weights_offloaded``, level 2 sets ``discard_all``; any other
            level leaves both at 0.

    The gauges are only registered when ``VLLM_SERVER_DEV_MODE`` is enabled,
    whereas the engines call this method whenever a logger manager exists.
    When the gauges are absent the call is a no-op instead of failing with
    ``AttributeError``.
    """
    gauges = getattr(self, "gauge_engine_sleep_state", None)
    if not gauges:
        return

    sleeping = sleep == 1
    awake = 0 if sleeping else 1
    weights_offloaded = 1 if sleeping and level == 1 else 0
    discard_all = 1 if sleeping and level == 2 else 0

    for engine_idx in self.engine_indexes:
        gauges["discard_all"][engine_idx].set(discard_all)
        gauges["weights_offloaded"][engine_idx].set(weights_offloaded)
        gauges["awake"][engine_idx].set(awake)
|
||||||
|
|
||||||
def log_engine_initialized(self):
|
def log_engine_initialized(self):
|
||||||
self.log_metrics_info("cache_config", self.vllm_config.cache_config)
|
self.log_metrics_info("cache_config", self.vllm_config.cache_config)
|
||||||
|
|
||||||
@ -1131,6 +1179,10 @@ class StatLoggerManager:
|
|||||||
engine_idx=engine_idx,
|
engine_idx=engine_idx,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def record_sleep_state(self, sleep: int = 0, level: int = 0):
    """Forward a sleep / wake-up state change to every stat logger.

    Args:
        sleep: Sleep flag passed positionally to each logger.
        level: Sleep level passed positionally to each logger.
    """
    for stat_logger in self.stat_loggers:
        stat_logger.record_sleep_state(sleep, level)
|
||||||
|
|
||||||
def log(self):
|
def log(self):
|
||||||
for logger in self.stat_loggers:
|
for logger in self.stat_loggers:
|
||||||
logger.log()
|
logger.log()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user