mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-18 01:15:01 +08:00
[V1][Metrics] Deprecate metrics with gpu_ prefix for non GPU specific metrics. (#18354)
Signed-off-by: Saheli Bhattacharjee <saheli@krai.ai>
This commit is contained in:
parent
bd517eb9fe
commit
d1e34cc9ac
@ -947,7 +947,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
return SchedulerStats(
|
return SchedulerStats(
|
||||||
num_running_reqs=len(self.running),
|
num_running_reqs=len(self.running),
|
||||||
num_waiting_reqs=len(self.waiting),
|
num_waiting_reqs=len(self.waiting),
|
||||||
gpu_cache_usage=self.kv_cache_manager.usage,
|
kv_cache_usage=self.kv_cache_manager.usage,
|
||||||
prefix_cache_stats=prefix_cache_stats,
|
prefix_cache_stats=prefix_cache_stats,
|
||||||
spec_decoding_stats=spec_decoding_stats,
|
spec_decoding_stats=spec_decoding_stats,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -127,7 +127,7 @@ class LoggingStatLogger(StatLoggerBase):
|
|||||||
generation_throughput,
|
generation_throughput,
|
||||||
scheduler_stats.num_running_reqs,
|
scheduler_stats.num_running_reqs,
|
||||||
scheduler_stats.num_waiting_reqs,
|
scheduler_stats.num_waiting_reqs,
|
||||||
scheduler_stats.gpu_cache_usage * 100,
|
scheduler_stats.kv_cache_usage * 100,
|
||||||
self.prefix_caching_metrics.hit_rate * 100,
|
self.prefix_caching_metrics.hit_rate * 100,
|
||||||
)
|
)
|
||||||
self.spec_decoding_logging.log(log_fn=log_fn)
|
self.spec_decoding_logging.log(log_fn=log_fn)
|
||||||
@ -185,22 +185,49 @@ class PrometheusStatLogger(StatLoggerBase):
|
|||||||
#
|
#
|
||||||
# GPU cache
|
# GPU cache
|
||||||
#
|
#
|
||||||
|
# Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
|
||||||
|
# TODO: in 0.10, only enable if show_hidden_metrics=True
|
||||||
self.gauge_gpu_cache_usage = self._gauge_cls(
|
self.gauge_gpu_cache_usage = self._gauge_cls(
|
||||||
name="vllm:gpu_cache_usage_perc",
|
name="vllm:gpu_cache_usage_perc",
|
||||||
documentation="GPU KV-cache usage. 1 means 100 percent usage.",
|
documentation=(
|
||||||
|
"GPU KV-cache usage. 1 means 100 percent usage."
|
||||||
|
" DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
|
||||||
multiprocess_mode="mostrecent",
|
multiprocess_mode="mostrecent",
|
||||||
labelnames=labelnames).labels(*labelvalues)
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
|
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
|
||||||
|
# TODO: in 0.10, only enable if show_hidden_metrics=True
|
||||||
self.counter_gpu_prefix_cache_queries = self._counter_cls(
|
self.counter_gpu_prefix_cache_queries = self._counter_cls(
|
||||||
name="vllm:gpu_prefix_cache_queries",
|
name="vllm:gpu_prefix_cache_queries",
|
||||||
documentation=
|
documentation=
|
||||||
"GPU prefix cache queries, in terms of number of queried tokens.",
|
("GPU prefix cache queries, in terms of number of queried tokens."
|
||||||
|
" DEPRECATED: Use vllm:prefix_cache_queries instead."),
|
||||||
labelnames=labelnames).labels(*labelvalues)
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
|
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
|
||||||
|
# TODO: in 0.10, only enable if show_hidden_metrics=True
|
||||||
self.counter_gpu_prefix_cache_hits = self._counter_cls(
|
self.counter_gpu_prefix_cache_hits = self._counter_cls(
|
||||||
name="vllm:gpu_prefix_cache_hits",
|
name="vllm:gpu_prefix_cache_hits",
|
||||||
documentation=
|
documentation=(
|
||||||
"GPU prefix cache hits, in terms of number of cached tokens.",
|
"GPU prefix cache hits, in terms of number of cached tokens."
|
||||||
|
" DEPRECATED: Use vllm:prefix_cache_hits instead."),
|
||||||
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
|
self.gauge_kv_cache_usage = self._gauge_cls(
|
||||||
|
name="vllm:kv_cache_usage_perc",
|
||||||
|
documentation="KV-cache usage. 1 means 100 percent usage.",
|
||||||
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
|
self.counter_prefix_cache_queries = self._counter_cls(
|
||||||
|
name="vllm:prefix_cache_queries",
|
||||||
|
documentation=(
|
||||||
|
"Prefix cache queries, in terms of number of queried tokens."),
|
||||||
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
|
self.counter_prefix_cache_hits = self._counter_cls(
|
||||||
|
name="vllm:prefix_cache_hits",
|
||||||
|
documentation=(
|
||||||
|
"Prefix cache hits, in terms of number of cached tokens."),
|
||||||
labelnames=labelnames).labels(*labelvalues)
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -400,13 +427,19 @@ class PrometheusStatLogger(StatLoggerBase):
|
|||||||
self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
|
self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
|
||||||
self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
|
self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
|
||||||
|
|
||||||
self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
|
self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage)
|
||||||
|
self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage)
|
||||||
|
|
||||||
self.counter_gpu_prefix_cache_queries.inc(
|
self.counter_gpu_prefix_cache_queries.inc(
|
||||||
scheduler_stats.prefix_cache_stats.queries)
|
scheduler_stats.prefix_cache_stats.queries)
|
||||||
self.counter_gpu_prefix_cache_hits.inc(
|
self.counter_gpu_prefix_cache_hits.inc(
|
||||||
scheduler_stats.prefix_cache_stats.hits)
|
scheduler_stats.prefix_cache_stats.hits)
|
||||||
|
|
||||||
|
self.counter_prefix_cache_queries.inc(
|
||||||
|
scheduler_stats.prefix_cache_stats.queries)
|
||||||
|
self.counter_prefix_cache_hits.inc(
|
||||||
|
scheduler_stats.prefix_cache_stats.hits)
|
||||||
|
|
||||||
if scheduler_stats.spec_decoding_stats is not None:
|
if scheduler_stats.spec_decoding_stats is not None:
|
||||||
self.spec_decoding_prom.observe(
|
self.spec_decoding_prom.observe(
|
||||||
scheduler_stats.spec_decoding_stats)
|
scheduler_stats.spec_decoding_stats)
|
||||||
|
|||||||
@ -33,7 +33,7 @@ class SchedulerStats:
|
|||||||
num_running_reqs: int = 0
|
num_running_reqs: int = 0
|
||||||
num_waiting_reqs: int = 0
|
num_waiting_reqs: int = 0
|
||||||
|
|
||||||
gpu_cache_usage: float = 0.0
|
kv_cache_usage: float = 0.0
|
||||||
|
|
||||||
prefix_cache_stats: PrefixCacheStats = field(
|
prefix_cache_stats: PrefixCacheStats = field(
|
||||||
default_factory=PrefixCacheStats)
|
default_factory=PrefixCacheStats)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user