From 2dfdfed8a0fe5517f8d4050740c251b1c1d35eeb Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 3 Mar 2025 18:25:46 +0000
Subject: [PATCH] [V0][Metrics] Deprecate some KV/prefix cache metrics (#14136)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 vllm/engine/metrics.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 9379ba6146316..08be2cbc0b9d5 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -74,31 +74,51 @@ class Metrics:
             ],
             multiprocess_mode="livemostrecent",
         )
+
+        # Deprecated in 0.8 - KV cache offloading is not used in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_scheduler_swapped = self._gauge_cls(
             name="vllm:num_requests_swapped",
-            documentation="Number of requests swapped to CPU.",
+            documentation=(
+                "Number of requests swapped to CPU. "
+                "DEPRECATED: KV cache offloading is not used in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")
+
         #   KV Cache Usage in %
         self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames,
             multiprocess_mode="sum")
+
+        # Deprecated in 0.8 - KV cache offloading is not used in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_cpu_cache_usage = self._gauge_cls(
             name="vllm:cpu_cache_usage_perc",
-            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
+            documentation=(
+                "CPU KV-cache usage. 1 means 100 percent usage. "
+                "DEPRECATED: KV cache offloading is not used in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")
-        #   Prefix caching block hit rate
+
+        # Deprecated in 0.8 - KV cache offloading is not used in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
             name="vllm:cpu_prefix_cache_hit_rate",
-            documentation="CPU prefix cache block hit rate.",
+            documentation=(
+                "CPU prefix cache block hit rate. "
+                "DEPRECATED: KV cache offloading is not used in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")
+
+        # Deprecated in 0.8 - replaced by queries+hits counters in V1
+        # TODO: in 0.9, only enable if show_hidden_metrics=True
         self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
             name="vllm:gpu_prefix_cache_hit_rate",
-            documentation="GPU prefix cache block hit rate.",
+            documentation=("GPU prefix cache block hit rate. "
+                           "DEPRECATED: use vllm:gpu_prefix_cache_queries and "
+                           "vllm:gpu_prefix_cache_queries in V1"),
             labelnames=labelnames,
             multiprocess_mode="sum")