[Core][Prefix Hash] Fix prefix hash metrics sliding window maintenance (#24990)

Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Jialin Ouyang 2025-09-19 11:22:53 -07:00 committed by GitHub
parent 47fd08aaf9
commit 2506ce5189
2 changed files with 50 additions and 9 deletions
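
In short, the change tightens two things in PrefixCachingMetrics.observe, as detailed in the hunks below: observations with zero requests are no longer appended to the sliding window (per the new comment, so that they cannot displace useful entries), and eviction now loops until the window is back under max_recent_requests while always stopping before the most recently added entry. Paraphrasing the eviction condition before and after (names as they appear in the second file's hunk):

# Before: evict at most one oldest entry per observation; when the newest
# entry was the only one left, it could be evicted too.
if self.aggregated_requests > self.max_recent_requests:
    ...
# After: keep evicting while over the limit, but never drop the newest entry.
while len(self.query_queue) > 1 and self.aggregated_requests > self.max_recent_requests:
    ...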


@@ -513,27 +513,27 @@ def test_hash_request_tokens_no_mm_inputs(hash_fn):
     assert block_hashes[1] == hash_fn((block_hashes[0], (3, 4, 5), None))
+def _stats(requests: int, queries: int, hits: int) -> PrefixCacheStats:
+    return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
 def test_metrics():
     """
     Test the prefix caching metrics.
     """
-    def stats(requests, queries, hits):
-        return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
     metrics = PrefixCachingMetrics(max_recent_requests=5)
     assert metrics.hit_rate == 0.0
-    metrics.observe(stats(1, 20, 9))
+    metrics.observe(_stats(1, 20, 9))
     # 9 / 20 = 0.45
     assert metrics.hit_rate == 0.45
-    metrics.observe(stats(4, 80, 16))
+    metrics.observe(_stats(4, 80, 16))
     # 25 / 100 = 0.25
     assert metrics.hit_rate == 0.25
-    metrics.observe(stats(1, 10, 2))
+    metrics.observe(_stats(1, 10, 2))
     # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
     assert metrics.aggregated_requests == 5
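
Spelled out, the comment above means the window now holds (4, 80, 16) and (1, 10, 2) after the oldest entry (1, 20, 9) is evicted, so the aggregated counters work out as:

queries = 80 + 10   # aggregated_query_total == 90
hits = 16 + 2       # aggregated_query_hit == 18
assert hits / queries == 0.2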
@@ -549,6 +549,38 @@ def test_metrics():
     assert not metrics.query_queue
+def test_metrics_empty_stats():
+    """
+    Test the prefix caching metrics with empty stats.
+    """
+    metrics = PrefixCachingMetrics(max_recent_requests=5)
+    metrics.observe(_stats(0, 0, 0))
+    metrics.observe(_stats(1, 20, 9))
+    metrics.observe(_stats(0, 0, 0))
+    metrics.observe(_stats(4, 80, 16))
+    metrics.observe(_stats(0, 0, 0))
+    metrics.observe(_stats(1, 10, 2))
+    # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
+    assert metrics.aggregated_requests == 5
+    assert metrics.aggregated_query_total == 90
+    assert metrics.aggregated_query_hit == 18
+    assert metrics.hit_rate == 0.2
+    # Only the latest added stats are preserved: 10 / 20 = 0.5
+    metrics.observe(_stats(11, 20, 10))
+    assert metrics.aggregated_requests == 11
+    assert metrics.aggregated_query_total == 20
+    assert metrics.aggregated_query_hit == 10
+    assert metrics.hit_rate == 0.5
+    # Only the latest added stats are preserved: 30 / 40 = 0.75
+    metrics.observe(_stats(22, 40, 30))
+    assert metrics.aggregated_requests == 22
+    assert metrics.aggregated_query_total == 40
+    assert metrics.aggregated_query_hit == 30
+    assert metrics.hit_rate == 0.75
 def test_get_kv_cache_configs_multiple_workers():
     model_config = ModelConfig(max_model_len=16)
     vllm_config = VllmConfig(model_config=model_config)
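
The test_metrics_empty_stats test above leans on two properties of the fixed window: stats with requests == 0 never reach the deque, and eviction spares the newest entry, which is why aggregated_requests can legitimately exceed max_recent_requests (11 and then 22 against a limit of 5) right after a large batch. A standalone illustration of the second property with a plain deque and the same numbers as the test (hypothetical variable names):

from collections import deque

window = deque([(4, 80, 16), (1, 10, 2)])  # state after the 0.2 assertions
window.append((11, 20, 10))                # one batch larger than the limit
requests = sum(r for r, _, _ in window)    # 16, limit is 5
while len(window) > 1 and requests > 5:
    old, _, _ = window.popleft()
    requests -= old
assert list(window) == [(11, 20, 10)] and requests == 11
# Only the newest entry survives, so the hit rate becomes 10 / 20 = 0.5.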


@@ -127,14 +127,23 @@ class PrefixCachingMetrics:
         if stats.reset:
             self.reset()
+        # DO NOT append empty stats, to avoid useful info being kicked out
+        # of the sliding window.
+        if stats.requests == 0:
+            return
         # Update the metrics.
         self.query_queue.append((stats.requests, stats.queries, stats.hits))
         self.aggregated_requests += stats.requests
         self.aggregated_query_total += stats.queries
         self.aggregated_query_hit += stats.hits
-        # Remove the oldest stats if the number of requests exceeds.
-        if self.aggregated_requests > self.max_recent_requests:
+        # Remove the oldest stats until the number of requests no longer
+        # exceeds the limit.
+        # NOTE: The most recently added stats are always preserved.
+        while len(
+                self.query_queue
+        ) > 1 and self.aggregated_requests > self.max_recent_requests:
             old_requests, old_queries, old_hits = self.query_queue.popleft()
             self.aggregated_requests -= old_requests
             self.aggregated_query_total -= old_queries
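
Putting the pieces together, the fixed maintenance logic can be condensed into a small, self-contained sketch. This is a simplified stand-in, not vLLM's exact implementation: the class name SlidingPrefixCacheMetrics is made up for this sketch, PrefixCacheStats is reduced to plain integers, and only the fields exercised by the tests above are kept.

from collections import deque


class SlidingPrefixCacheMetrics:
    """Sliding-window prefix-cache hit-rate tracker (simplified sketch)."""

    def __init__(self, max_recent_requests: int = 1000) -> None:
        self.max_recent_requests = max_recent_requests
        self.query_queue: deque[tuple[int, int, int]] = deque()
        self.aggregated_requests = 0
        self.aggregated_query_total = 0
        self.aggregated_query_hit = 0

    def observe(self, requests: int, queries: int, hits: int) -> None:
        # Skip empty stats so they cannot displace useful entries.
        if requests == 0:
            return
        self.query_queue.append((requests, queries, hits))
        self.aggregated_requests += requests
        self.aggregated_query_total += queries
        self.aggregated_query_hit += hits
        # Evict the oldest entries until the window is back under the
        # request limit, but always keep the most recently added entry.
        while (len(self.query_queue) > 1
               and self.aggregated_requests > self.max_recent_requests):
            old_requests, old_queries, old_hits = self.query_queue.popleft()
            self.aggregated_requests -= old_requests
            self.aggregated_query_total -= old_queries
            self.aggregated_query_hit -= old_hits

    @property
    def hit_rate(self) -> float:
        if self.aggregated_query_total == 0:
            return 0.0
        return self.aggregated_query_hit / self.aggregated_query_total


metrics = SlidingPrefixCacheMetrics(max_recent_requests=5)
metrics.observe(0, 0, 0)     # ignored, does not enter the window
metrics.observe(1, 20, 9)
metrics.observe(4, 80, 16)
metrics.observe(1, 10, 2)    # evicts (1, 20, 9): 18 / 90 = 0.2
assert metrics.hit_rate == 0.2
metrics.observe(11, 20, 10)  # only the newest entry survives: 0.5
assert metrics.hit_rate == 0.5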