[Metrics] Hide deprecated metrics with gpu_ prefix (#24245)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
commit 2942970d44 (parent 3c96e7b8a1)
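This change adds the renamed vllm:kv_cache_usage_perc, vllm:prefix_cache_queries, and vllm:prefix_cache_hits metrics to the test's expected-metrics list, moves the deprecated gpu_-prefixed variants into HIDDEN_DEPRECATED_METRICS, and gates their registration and updates in PrometheusStatLogger behind self.show_hidden_metrics.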
@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
     "vllm:gpu_cache_usage_perc",
     "vllm:gpu_prefix_cache_queries",
     "vllm:gpu_prefix_cache_hits",
+    "vllm:kv_cache_usage_perc",
+    "vllm:prefix_cache_queries",
+    "vllm:prefix_cache_hits",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
 ]
 
 HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
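For context, a test consuming these two lists would typically assert that every expected metric appears in the /metrics output while skipping the deprecated names when the server runs with default flags. A minimal sketch; `metrics_text` and `show_hidden` are illustrative names, not the repository's actual test code:

    # Sketch: deprecated names are only asserted when hidden metrics were
    # re-enabled via --show-hidden-metrics-for-version (illustrative).
    for metric in EXPECTED_METRICS_V1:
        if metric in HIDDEN_DEPRECATED_METRICS and not show_hidden:
            continue
        assert metric in metrics_text, f"missing metric: {metric}"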
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
                                    client: openai.AsyncClient, use_v1: bool):
 
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))
 
     # Expect no running requests or kvcache usage
     assert running_requests == 0
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))
 
     # Expect running requests and kvcache usage
     assert running_requests > 0
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server))
+        _get_running_metrics_from_api(server, use_v1))
 
     assert running_requests_after == 0,\
         (f"Expected 0 running requests after abort, got "
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
             f"{kv_cache_usage_after}")
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
+    kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
+                             if use_v1 else "vllm:gpu_cache_usage_perc")
+
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
             for sample in family.samples:
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
                 if sample.name == "vllm:num_requests_waiting":
                     waiting_requests = sample.value
                     break
-        elif family.name == "vllm:gpu_cache_usage_perc":
+        elif family.name == kv_cache_usage_metric:
             for sample in family.samples:
-                if sample.name == "vllm:gpu_cache_usage_perc":
+                if sample.name == kv_cache_usage_metric:
                     kv_cache_usage = sample.value
                     break
 
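The helper walks the Prometheus text exposition format via prometheus_client. As a self-contained illustration of that parsing API, with a made-up one-line payload:

    from prometheus_client.parser import text_string_to_metric_families

    # Each family groups samples sharing a metric name; sample.name,
    # sample.labels and sample.value mirror the exposition text.
    payload = 'vllm:num_requests_running{model_name="m"} 2.0\n'
    for family in text_string_to_metric_families(payload):
        for sample in family.samples:
            print(sample.name, sample.labels, sample.value)
    # -> vllm:num_requests_running {'model_name': 'm'} 2.0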
@@ -202,8 +202,10 @@ class PrometheusStatLogger(StatLoggerBase):
         #
         # GPU cache
         #
-        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        gauge_gpu_cache_usage = self._gauge_cls(
-            name="vllm:gpu_cache_usage_perc",
-            documentation=(
+        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            gauge_gpu_cache_usage = self._gauge_cls(
+                name="vllm:gpu_cache_usage_perc",
+                documentation=(
@@ -211,23 +213,27 @@ class PrometheusStatLogger(StatLoggerBase):
-                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
-            multiprocess_mode="mostrecent",
-            labelnames=labelnames)
-        self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage,
-                                                     engine_indexes,
-                                                     model_name)
+                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
+                multiprocess_mode="mostrecent",
+                labelnames=labelnames)
+            self.gauge_gpu_cache_usage = make_per_engine(
+                gauge_gpu_cache_usage, engine_indexes, model_name)
 
-        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        counter_gpu_prefix_cache_queries = self._counter_cls(
-            name="vllm:gpu_prefix_cache_queries",
-            documentation=(
-                "GPU prefix cache queries, in terms of number of queried"
-                "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."),
-            labelnames=labelnames)
-        self.counter_gpu_prefix_cache_queries = make_per_engine(
-            counter_gpu_prefix_cache_queries, engine_indexes, model_name)
+        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            counter_gpu_prefix_cache_queries = self._counter_cls(
+                name="vllm:gpu_prefix_cache_queries",
+                documentation=(
+                    "GPU prefix cache queries, in terms of number of queried"
+                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
+                ),
+                labelnames=labelnames)
+            self.counter_gpu_prefix_cache_queries = make_per_engine(
+                counter_gpu_prefix_cache_queries, engine_indexes, model_name)
 
-        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
-        # TODO: in 0.10, only enable if show_hidden_metrics=True
-        counter_gpu_prefix_cache_hits = self._counter_cls(
-            name="vllm:gpu_prefix_cache_hits",
-            documentation=(
+        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
+        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
+        # TODO: remove in 0.12.0
+        if self.show_hidden_metrics:
+            counter_gpu_prefix_cache_hits = self._counter_cls(
+                name="vllm:gpu_prefix_cache_hits",
+                documentation=(
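self.show_hidden_metrics is the switch behind the --show-hidden-metrics-for-version flag referenced in the comments above: a metric hidden in 0.11 stays visible if the operator explicitly opts back into the 0.10 metric surface. A hedged sketch of that version comparison, with hypothetical names (not vLLM's actual implementation):

    def should_show_hidden(hidden_in: str, requested: str | None) -> bool:
        # Hypothetical check: a metric hidden in release `hidden_in` is
        # shown again when the operator requests an older metric surface.
        if requested is None:
            return False

        def as_tuple(version: str) -> tuple[int, ...]:
            return tuple(int(part) for part in version.split("."))

        return as_tuple(requested) < as_tuple(hidden_in)

    assert should_show_hidden("0.11", "0.10")    # opted in -> shown
    assert not should_show_hidden("0.11", None)  # default -> hidden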
@@ -509,11 +515,13 @@ class PrometheusStatLogger(StatLoggerBase):
             self.gauge_scheduler_waiting[engine_idx].set(
                 scheduler_stats.num_waiting_reqs)
 
-            self.gauge_gpu_cache_usage[engine_idx].set(
-                scheduler_stats.kv_cache_usage)
+            if self.show_hidden_metrics:
+                self.gauge_gpu_cache_usage[engine_idx].set(
+                    scheduler_stats.kv_cache_usage)
             self.gauge_kv_cache_usage[engine_idx].set(
                 scheduler_stats.kv_cache_usage)
 
-            self.counter_gpu_prefix_cache_queries[engine_idx].inc(
-                scheduler_stats.prefix_cache_stats.queries)
+            if self.show_hidden_metrics:
+                self.counter_gpu_prefix_cache_queries[engine_idx].inc(
+                    scheduler_stats.prefix_cache_stats.queries)
             self.counter_gpu_prefix_cache_hits[engine_idx].inc(
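With this change in place, a default scrape should expose only the renamed metrics; the gpu_-prefixed ones reappear only when hidden metrics are re-enabled. A quick check, assuming a local vLLM server on port 8000 (the URL is illustrative):

    import requests

    # Deprecated names should be absent by default; renamed ones present.
    text = requests.get("http://localhost:8000/metrics").text
    for name in ("vllm:gpu_cache_usage_perc", "vllm:kv_cache_usage_perc"):
        print(name, "exported" if name in text else "not exported")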