diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d4eb8b8d622f1..2d2274ab6a4d5 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -947,7 +947,7 @@ class Scheduler(SchedulerInterface): return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), - gpu_cache_usage=self.kv_cache_manager.usage, + kv_cache_usage=self.kv_cache_manager.usage, prefix_cache_stats=prefix_cache_stats, spec_decoding_stats=spec_decoding_stats, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 2d621ec31038f..11865a0fd1f27 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -127,7 +127,7 @@ class LoggingStatLogger(StatLoggerBase): generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, - scheduler_stats.gpu_cache_usage * 100, + scheduler_stats.kv_cache_usage * 100, self.prefix_caching_metrics.hit_rate * 100, ) self.spec_decoding_logging.log(log_fn=log_fn) @@ -185,22 +185,49 @@ class PrometheusStatLogger(StatLoggerBase): # # GPU cache # + # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc + # TODO: in 0.10, only enable if show_hidden_metrics=True self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", - documentation="GPU KV-cache usage. 1 means 100 percent usage.", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." + "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) + # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries + # TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_queries = self._counter_cls( name="vllm:gpu_prefix_cache_queries", documentation= - "GPU prefix cache queries, in terms of number of queried tokens.", + ("GPU prefix cache queries, in terms of number of queried tokens." + "DEPRECATED: Use vllm:prefix_cache_queries instead."), labelnames=labelnames).labels(*labelvalues) + # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits + # TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_hits = self._counter_cls( name="vllm:gpu_prefix_cache_hits", - documentation= - "GPU prefix cache hits, in terms of number of cached tokens.", + documentation=( + "GPU prefix cache hits, in terms of number of cached tokens." + "DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames).labels(*labelvalues) + + self.gauge_kv_cache_usage = self._gauge_cls( + name="vllm:kv_cache_usage_perc", + documentation="KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_prefix_cache_queries = self._counter_cls( + name="vllm:prefix_cache_queries", + documentation=( + "Prefix cache queries, in terms of number of queried tokens."), + labelnames=labelnames).labels(*labelvalues) + + self.counter_prefix_cache_hits = self._counter_cls( + name="vllm:prefix_cache_hits", + documentation=( + "Prefix cache hits, in terms of number of cached tokens."), labelnames=labelnames).labels(*labelvalues) # @@ -400,13 +427,19 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage) + self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage) self.counter_gpu_prefix_cache_queries.inc( scheduler_stats.prefix_cache_stats.queries) self.counter_gpu_prefix_cache_hits.inc( scheduler_stats.prefix_cache_stats.hits) + self.counter_prefix_cache_queries.inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_prefix_cache_hits.inc( + scheduler_stats.prefix_cache_stats.hits) + if scheduler_stats.spec_decoding_stats is not None: self.spec_decoding_prom.observe( scheduler_stats.spec_decoding_stats) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 50c8b07fe54d2..4a5d5fac49d1d 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -33,7 +33,7 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 - gpu_cache_usage: float = 0.0 + kv_cache_usage: float = 0.0 prefix_cache_stats: PrefixCacheStats = field( default_factory=PrefixCacheStats)