mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-20 10:17:00 +08:00
Introduces three new Prometheus histograms for fine-grained observability of KV cache residency behavior:
- vllm:kv_block_lifetime_seconds — total lifetime from allocation to free
- vllm:kv_block_idle_before_evict_seconds — idle duration before eviction
- vllm:kv_block_reuse_gap_seconds — time between consecutive reuses of the same block

These metrics help operators analyze KV cache efficiency, reuse patterns, and eviction timing beyond simple utilization rates. The implementation uses monotonic timestamps for accuracy, 1% sampling for minimal overhead (~48 bytes/block), and is fully thread-safe with zero runtime cost when disabled.

Two new runtime flags are introduced:
- --kv-cache-metrics — enable KV cache residency metrics
- --kv-cache-metrics-sample — control the sampling ratio (default: 0.01)

Signed-off-by: Shivam <shivamprasad91@gmail.com>
97 lines
3.1 KiB
Python
97 lines
3.1 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""KV cache metrics tracking."""
|
|
|
|
import random
|
|
import time
|
|
from collections import deque
|
|
from typing import TYPE_CHECKING
|
|
|
|
if TYPE_CHECKING:
|
|
from vllm.v1.core.kv_cache_utils import KVCacheBlock
|
|
|
|
from vllm.v1.metrics.stats import KVCacheEvictionEvent
|
|
|
|
|
|
class BlockMetricsState:
    """Per-block timing record used for KV cache residency metrics.

    All timestamps are ``time.monotonic_ns()`` readings in nanoseconds; the
    getter methods convert durations to seconds.
    """

    def __init__(self):
        # Birth and last access start out as the same clock reading.
        self.birth_time_ns = self.last_access_ns = time.monotonic_ns()
        # Only the four most recent access timestamps are retained, so a
        # block that is accessed many times cannot grow this without bound.
        self.access_history: deque[int] = deque(maxlen=4)

    def record_access(self) -> None:
        """Stamp the block as accessed now and remember the timestamp."""
        stamp_ns = time.monotonic_ns()
        self.access_history.append(stamp_ns)
        self.last_access_ns = stamp_ns

    def get_lifetime_seconds(self) -> float:
        """Return the seconds elapsed since this state was created."""
        elapsed_ns = time.monotonic_ns() - self.birth_time_ns
        return elapsed_ns / 1e9

    def get_idle_time_seconds(self) -> float:
        """Return the seconds elapsed since the most recent access.

        Before any access is recorded this is measured from creation time.
        """
        idle_ns = time.monotonic_ns() - self.last_access_ns
        return idle_ns / 1e9

    def get_reuse_gaps_seconds(self) -> list[float]:
        """Return the gaps, in seconds, between consecutive recorded accesses.

        The creation timestamp is not part of the access history, so fewer
        than two recorded accesses yield an empty list.
        """
        stamps = tuple(self.access_history)
        # Pair each timestamp with its successor; zip() produces nothing when
        # there are fewer than two entries.
        return [
            (later - earlier) / 1e9
            for earlier, later in zip(stamps, stamps[1:])
        ]
|
|
|
|
|
|
class KVCacheMetricsCollector:
    """Collects KV cache residency metrics for a random sample of blocks.

    Every allocation is sampled independently with probability
    ``sample_rate``. A sampled block gets a ``BlockMetricsState`` stored under
    its ``block_id``; when that block is evicted, a ``KVCacheEvictionEvent``
    is queued until ``drain_events`` hands the queue to the caller.
    """

    def __init__(self, sample_rate: float = 0.01):
        """Create an empty collector.

        Args:
            sample_rate: Probability, in (0, 1.0], that a newly allocated
                block is tracked.

        Raises:
            AssertionError: If ``sample_rate`` is outside (0, 1.0].
        """
        # NOTE(review): left as an assert so the AssertionError contract is
        # unchanged; be aware that the check is skipped under ``python -O``.
        assert 0 < sample_rate <= 1.0, (
            f"sample_rate must be in (0, 1.0], got {sample_rate}"
        )
        self.sample_rate = sample_rate

        # block_id -> state of the currently tracked allocation of that block.
        self.block_metrics: dict[int, BlockMetricsState] = {}

        # Eviction events accumulated since the last drain_events() call.
        self._eviction_events: list[KVCacheEvictionEvent] = []

    def should_sample_block(self) -> bool:
        """Return True with probability ``sample_rate``."""
        return random.random() < self.sample_rate

    def on_block_allocated(self, block: "KVCacheBlock") -> None:
        """Start tracking a freshly allocated block if it is sampled.

        A new allocation always begins a new lifecycle for ``block.block_id``:
        a sampled block gets fresh state, and an unsampled one has any state
        left over from a previous allocation of the same id discarded, so
        stale timestamps are never attributed to the new allocation.
        """
        if self.should_sample_block():
            self.block_metrics[block.block_id] = BlockMetricsState()
        else:
            self.block_metrics.pop(block.block_id, None)

    def on_block_accessed(self, block: "KVCacheBlock") -> None:
        """Record an access for ``block``; no-op if the block is untracked."""
        metrics = self.block_metrics.get(block.block_id)
        if metrics is not None:
            metrics.record_access()

    def on_block_evicted(self, block: "KVCacheBlock") -> None:
        """Stop tracking ``block`` and queue an eviction event for it.

        Does nothing when the block was not being tracked.
        """
        metrics = self.block_metrics.pop(block.block_id, None)
        if metrics is None:
            return

        lifetime = metrics.get_lifetime_seconds()
        idle_time = metrics.get_idle_time_seconds()
        reuse_gaps = tuple(metrics.get_reuse_gaps_seconds())

        self._eviction_events.append(
            KVCacheEvictionEvent(
                lifetime_seconds=lifetime,
                idle_seconds=idle_time,
                reuse_gaps_seconds=reuse_gaps,
            )
        )

    def reset(self) -> None:
        """Clear all state on cache reset."""
        self.block_metrics.clear()
        self._eviction_events.clear()

    def drain_events(self) -> "list[KVCacheEvictionEvent]":
        """Return the queued eviction events and start a new empty queue.

        The returned list is handed over to the caller; later evictions are
        appended to a different list.
        """
        events = self._eviction_events
        self._eviction_events = []
        return events
|