[V1][Metrics] Deprecate the gpu_ prefix on metrics that are not GPU-specific (#18354)

Signed-off-by: Saheli Bhattacharjee <saheli@krai.ai>
Author: Saheli Bhattacharjee
Date: 2025-06-14 04:07:36 +01:00 (committed via GitHub)
Parent: bd517eb9fe
Commit: d1e34cc9ac
3 changed files with 41 additions and 8 deletions


@@ -947,7 +947,7 @@ class Scheduler(SchedulerInterface):
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
-            gpu_cache_usage=self.kv_cache_manager.usage,
+            kv_cache_usage=self.kv_cache_manager.usage,
             prefix_cache_stats=prefix_cache_stats,
             spec_decoding_stats=spec_decoding_stats,
         )
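The scheduler reports KV-cache occupancy as a fraction in [0, 1], which is why the rename drops the gpu_ prefix: the value describes the KV-cache pool, not a GPU as such. A minimal sketch of how such a usage fraction can be derived from block accounting (the class and attribute names below are illustrative, not vLLM's actual KVCacheManager internals):

class KVCacheUsageSketch:
    """Illustrative only: KV-cache occupancy as a 0.0-1.0 fraction."""

    def __init__(self, num_total_blocks: int) -> None:
        self.num_total_blocks = num_total_blocks
        self.num_free_blocks = num_total_blocks

    @property
    def usage(self) -> float:
        # 0.0 means empty, 1.0 means every block is allocated.
        if self.num_total_blocks == 0:
            return 0.0
        return 1.0 - self.num_free_blocks / self.num_total_blocks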


@@ -127,7 +127,7 @@ class LoggingStatLogger(StatLoggerBase):
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
-            scheduler_stats.gpu_cache_usage * 100,
+            scheduler_stats.kv_cache_usage * 100,
             self.prefix_caching_metrics.hit_rate * 100,
         )
         self.spec_decoding_logging.log(log_fn=log_fn)
@@ -185,22 +185,49 @@ class PrometheusStatLogger(StatLoggerBase):
         #
         # GPU cache
         #
+        # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
-            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            documentation=(
+                "GPU KV-cache usage. 1 means 100 percent usage. "
+                "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
             multiprocess_mode="mostrecent",
             labelnames=labelnames).labels(*labelvalues)

+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
         self.counter_gpu_prefix_cache_queries = self._counter_cls(
             name="vllm:gpu_prefix_cache_queries",
             documentation=
-            "GPU prefix cache queries, in terms of number of queried tokens.",
+            ("GPU prefix cache queries, in terms of number of queried tokens. "
+             "DEPRECATED: Use vllm:prefix_cache_queries instead."),
             labelnames=labelnames).labels(*labelvalues)

+        # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
+        # TODO: in 0.10, only enable if show_hidden_metrics=True
         self.counter_gpu_prefix_cache_hits = self._counter_cls(
             name="vllm:gpu_prefix_cache_hits",
-            documentation=
-            "GPU prefix cache hits, in terms of number of cached tokens.",
+            documentation=(
+                "GPU prefix cache hits, in terms of number of cached tokens. "
+                "DEPRECATED: Use vllm:prefix_cache_hits instead."),
             labelnames=labelnames).labels(*labelvalues)

+        self.gauge_kv_cache_usage = self._gauge_cls(
+            name="vllm:kv_cache_usage_perc",
+            documentation="KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_prefix_cache_queries = self._counter_cls(
+            name="vllm:prefix_cache_queries",
+            documentation=(
+                "Prefix cache queries, in terms of number of queried tokens."),
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_prefix_cache_hits = self._counter_cls(
+            name="vllm:prefix_cache_hits",
+            documentation=(
+                "Prefix cache hits, in terms of number of cached tokens."),
+            labelnames=labelnames).labels(*labelvalues)

         #
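The TODO comments above describe the usual two-release deprecation dance: the renamed metric is always registered, while the gpu_-prefixed alias survives one more release and is then expected to hide behind a show_hidden_metrics switch. A minimal sketch of that gating with plain prometheus_client (the flag name and helper are assumptions drawn from the TODOs, not the shipped vLLM API):

from prometheus_client import CollectorRegistry, Gauge


def make_usage_gauges(registry: CollectorRegistry,
                      show_hidden_metrics: bool = False) -> dict[str, Gauge]:
    # The renamed metric is always available.
    gauges = {
        "kv_cache_usage": Gauge(
            "vllm:kv_cache_usage_perc",
            "KV-cache usage. 1 means 100 percent usage.",
            registry=registry),
    }
    # The deprecated alias is only kept alive on explicit opt-in.
    if show_hidden_metrics:
        gauges["gpu_cache_usage"] = Gauge(
            "vllm:gpu_cache_usage_perc",
            "GPU KV-cache usage. 1 means 100 percent usage. "
            "DEPRECATED: Use vllm:kv_cache_usage_perc instead.",
            registry=registry)
    return gauges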
@@ -400,13 +427,19 @@ class PrometheusStatLogger(StatLoggerBase):
         self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)

-        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
+        self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage)
+        self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage)

         self.counter_gpu_prefix_cache_queries.inc(
             scheduler_stats.prefix_cache_stats.queries)
         self.counter_gpu_prefix_cache_hits.inc(
             scheduler_stats.prefix_cache_stats.hits)
+        self.counter_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)
+
         if scheduler_stats.spec_decoding_stats is not None:
             self.spec_decoding_prom.observe(
                 scheduler_stats.spec_decoding_stats)
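During the transition every update is written to both series, so dashboards keyed to either name see the same value. A self-contained demonstration of the dual-write and the resulting exposition, using plain prometheus_client rather than vLLM's wrapped metric classes:

from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
gauge_kv = Gauge("vllm:kv_cache_usage_perc",
                 "KV-cache usage. 1 means 100 percent usage.",
                 registry=registry)
gauge_gpu = Gauge("vllm:gpu_cache_usage_perc",
                  "GPU KV-cache usage. 1 means 100 percent usage. "
                  "DEPRECATED: Use vllm:kv_cache_usage_perc instead.",
                  registry=registry)

# One stat feeds both the renamed series and the deprecated alias.
kv_cache_usage = 0.37
gauge_kv.set(kv_cache_usage)
gauge_gpu.set(kv_cache_usage)

exposition = generate_latest(registry).decode()
assert "vllm:kv_cache_usage_perc 0.37" in exposition
assert "vllm:gpu_cache_usage_perc 0.37" in exposition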


@@ -33,7 +33,7 @@ class SchedulerStats:
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0

-    gpu_cache_usage: float = 0.0
+    kv_cache_usage: float = 0.0

     prefix_cache_stats: PrefixCacheStats = field(
         default_factory=PrefixCacheStats)
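Anything that constructs or reads SchedulerStats has to follow the rename; passing the old gpu_cache_usage keyword now raises a TypeError. A quick sketch against a pared-down copy of the dataclass, trimmed to the fields visible in this diff:

from dataclasses import dataclass, field


@dataclass
class PrefixCacheStats:
    queries: int = 0
    hits: int = 0


@dataclass
class SchedulerStats:
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0
    kv_cache_usage: float = 0.0  # formerly gpu_cache_usage
    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)


stats = SchedulerStats(num_running_reqs=4, kv_cache_usage=0.42)
print(f"KV cache usage: {stats.kv_cache_usage * 100:.1f}%")  # KV cache usage: 42.0%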