From 2dfdfed8a0fe5517f8d4050740c251b1c1d35eeb Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Mon, 3 Mar 2025 18:25:46 +0000 Subject: [PATCH] [V0][Metrics] Deprecate some KV/prefix cache metrics (#14136) Signed-off-by: Mark McLoughlin --- vllm/engine/metrics.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 9379ba6146316..08be2cbc0b9d5 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -74,31 +74,51 @@ class Metrics: ], multiprocess_mode="livemostrecent", ) + + # Deprecated in 0.8 - KV cache offloading is not used in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_scheduler_swapped = self._gauge_cls( name="vllm:num_requests_swapped", - documentation="Number of requests swapped to CPU.", + documentation=( + "Number of requests swapped to CPU. " + "DEPRECATED: KV cache offloading is not used in V1"), labelnames=labelnames, multiprocess_mode="sum") + # KV Cache Usage in % self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 1 means 100 percent usage.", labelnames=labelnames, multiprocess_mode="sum") + + # Deprecated in 0.8 - KV cache offloading is not used in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_cpu_cache_usage = self._gauge_cls( name="vllm:cpu_cache_usage_perc", - documentation="CPU KV-cache usage. 1 means 100 percent usage.", + documentation=( + "CPU KV-cache usage. 1 means 100 percent usage. 
" + "DEPRECATED: KV cache offloading is not used in V1"), labelnames=labelnames, multiprocess_mode="sum") - # Prefix caching block hit rate + + # Deprecated in 0.8 - KV cache offloading is not used in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls( name="vllm:cpu_prefix_cache_hit_rate", - documentation="CPU prefix cache block hit rate.", + documentation=( + "CPU prefix cache block hit rate. " + "DEPRECATED: KV cache offloading is not used in V1"), labelnames=labelnames, multiprocess_mode="sum") + + # Deprecated in 0.8 - replaced by queries+hits counters in V1 + # TODO: in 0.9, only enable if show_hidden_metrics=True self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls( name="vllm:gpu_prefix_cache_hit_rate", - documentation="GPU prefix cache block hit rate.", + documentation=("GPU prefix cache block hit rate. " + "DEPRECATED: use vllm:gpu_prefix_cache_queries and " + "vllm:gpu_prefix_cache_hits in V1"), labelnames=labelnames, multiprocess_mode="sum")