[v1][KVCacheManager] Change prefix caching metric from counting blocks to counting tokens (#18003)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2026-03-17 06:37:07 +08:00 · 2025-05-13 04:46:12 +08:00 · 2025-05-13 04:46:12 +08:00 · 302f3aca7e
commit 302f3aca7e
parent e9c730c9bd
3 changed files with 9 additions and 9 deletions
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@ -161,11 +161,15 @@ class KVCacheManager:

        computed_blocks = (
            self.single_type_manager.find_longest_cache_hit(block_hashes))
+        # NOTE(woosuk): Since incomplete blocks are not eligible for
+        # sharing, `num_computed_tokens` is always a multiple of
+        # `block_size`.
+        num_computed_tokens = len(computed_blocks) * self.block_size

        if self.log_stats:
            assert self.prefix_cache_stats is not None
-            self.prefix_cache_stats.queries += len(block_hashes)
-            self.prefix_cache_stats.hits += len(computed_blocks)
+            self.prefix_cache_stats.queries += request.num_tokens
+            self.prefix_cache_stats.hits += num_computed_tokens

        if last_block_hash is not None:
            # Add back the last block hash if it was removed.
@ -173,10 +177,6 @@ class KVCacheManager:
            # we shouldn't modify it directly.
            block_hashes.append(last_block_hash)

-        # NOTE(woosuk): Since incomplete blocks are not eligible for
-        # sharing, `num_computed_tokens` is always a multiple of
-        # `block_size`.
-        num_computed_tokens = len(computed_blocks) * self.block_size
        return KVCacheBlocks(computed_blocks), num_computed_tokens

    def allocate_slots(
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@ -183,13 +183,13 @@ class PrometheusStatLogger(StatLoggerBase):
        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
            name="vllm:gpu_prefix_cache_queries",
            documentation=
-            "GPU prefix cache queries, in terms of number of queried blocks.",
+            "GPU prefix cache queries, in terms of number of queried tokens.",
            labelnames=labelnames).labels(*labelvalues)

        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
            name="vllm:gpu_prefix_cache_hits",
            documentation=
-            "GPU prefix cache hits, in terms of number of cached blocks.",
+            "GPU prefix cache hits, in terms of number of cached tokens.",
            labelnames=labelnames).labels(*labelvalues)

        #
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@ -19,7 +19,7 @@ class PrefixCacheStats:
    # The number of requests in this update.
    requests: int = 0
    # The number of queries in these requests. Note that "queries" here
-    # means the number of blocks that were queried from the cache.
+    # means the number of tokens that were queried from the cache.
    queries: int = 0
    # The number of hits in these requests.
    hits: int = 0