vllm/tests/entrypoints/openai/test_sleep.py
Braulio Dumba 1da3309ace
[Core] Exposing engine sleep & wake_up state as prometheus metrics (#24176)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
2025-10-29 09:32:01 -07:00

111 lines
4.0 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import requests
from prometheus_client.parser import text_string_to_metric_families
from ...utils import RemoteOpenAIServer
MODEL_NAME = "meta-llama/Llama-3.2-1B"
def test_sleep_mode():
    """End-to-end test of /sleep, /wake_up and /is_sleeping, including the
    ``vllm:engine_sleep_state`` gauge exposed on /metrics.

    Spawns a real server (dev-mode endpoints enabled), puts it to sleep at
    level 1, wakes it fully, then exercises partial wake-up via tags.
    """
    # dtype, max-len etc set so that this can run in CI
    args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--max-num-seqs",
        "128",
        "--enable-sleep-mode",
    ]
    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
        env_dict={"VLLM_SERVER_DEV_MODE": "1", "CUDA_VISIBLE_DEVICES": "0"},
    ) as remote_server:

        def _assert_is_sleeping(expected: bool) -> None:
            # Query /is_sleeping and check the reported state.
            response = requests.get(remote_server.url_for("is_sleeping"))
            assert response.status_code == 200
            assert response.json().get("is_sleeping") is expected

        def _assert_sleep_metrics(
            awake: int, weights_offloaded: int, discard_all: int
        ) -> None:
            # Scrape /metrics and check the engine sleep-state gauge values.
            # (Prometheus sample values are floats; int comparison is fine.)
            response = requests.get(remote_server.url_for("metrics"))
            assert response.status_code == 200
            metrics = _get_sleep_metrics_from_api(response)
            assert metrics == (awake, weights_offloaded, discard_all)

        # Level-1 sleep: weights are offloaded, engine reports sleeping.
        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert response.status_code == 200
        _assert_is_sleeping(True)
        _assert_sleep_metrics(awake=0, weights_offloaded=1, discard_all=0)

        # Full wake-up restores the awake state.
        response = requests.post(remote_server.url_for("wake_up"))
        assert response.status_code == 200
        _assert_is_sleeping(False)
        _assert_sleep_metrics(awake=1, weights_offloaded=0, discard_all=0)

        # Test wake up with tags: wake components one at a time.
        response = requests.post(remote_server.url_for("sleep"), params={"level": "1"})
        assert response.status_code == 200
        response = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["weights"]}
        )
        assert response.status_code == 200
        # Only the weights were woken; the kv_cache is still asleep, so the
        # engine must keep reporting sleeping until every part is awake.
        _assert_is_sleeping(True)
        response = requests.post(
            remote_server.url_for("wake_up"), params={"tags": ["kv_cache"]}
        )
        assert response.status_code == 200
        _assert_is_sleeping(False)
        _assert_sleep_metrics(awake=1, weights_offloaded=0, discard_all=0)
def _get_sleep_metrics_from_api(response: requests.Response):
    """Parse a /metrics response and return the engine sleep-state gauge.

    Returns the tuple ``(awake, weights_offloaded, discard_all)`` taken from
    the ``vllm:engine_sleep_state`` metric family. Fails the test if any of
    the three states is missing from the scrape.
    """
    # NOTE(review): samples are matched on the label *value* (e.g. "awake"),
    # not on a specific label name — mirrors how the metric is emitted.
    wanted = ("awake", "weights_offloaded", "discard_all")
    found: dict[str, float] = {}
    for family in text_string_to_metric_families(response.text):
        if family.name != "vllm:engine_sleep_state":
            continue
        for sample in family.samples:
            if sample.name != "vllm:engine_sleep_state":
                continue
            for label_value in sample.labels.values():
                if label_value in wanted:
                    found[label_value] = sample.value
    for state in wanted:
        assert state in found
    return found["awake"], found["weights_offloaded"], found["discard_all"]