diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index de2333901cc91..8c1bb1a897e37 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -203,6 +203,8 @@ EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_success_total",
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 8df4cbe1be71b..ba08b83ec54e5 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -5,10 +5,11 @@ import pytest
 from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+                                         KVCacheBlock, PrefixCachingMetrics,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
@@ -277,3 +278,39 @@ def test_hash_request_tokens_no_mm_inputs():
     assert block_hashes[0].extra_keys is None
     assert block_hashes[1].token_ids == (3, 4, 5)
     assert block_hashes[1].extra_keys is None
+
+
+def test_metrics():
+    """
+    Test the prefix caching metrics.
+    """
+
+    def stats(requests, queries, hits):
+        return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
+
+    metrics = PrefixCachingMetrics(interval=5)
+    assert metrics.hit_rate == 0.0
+
+    metrics.observe(stats(1, 20, 9))
+    # 9 / 20 = 0.45
+    assert metrics.hit_rate == 0.45
+
+    metrics.observe(stats(4, 80, 16))
+
+    # 25 / 100 = 0.25
+    assert metrics.hit_rate == 0.25
+
+    metrics.observe(stats(1, 10, 2))
+
+    # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
+    assert metrics.aggregated_requests == 5
+    assert metrics.aggregated_query_total == 90
+    assert metrics.aggregated_query_hit == 18
+    assert metrics.hit_rate == 0.2
+
+    metrics.reset()
+    assert metrics.hit_rate == 0.0
+    assert metrics.aggregated_requests == 0
+    assert metrics.aggregated_query_total == 0
+    assert metrics.aggregated_query_hit == 0
+    assert not metrics.query_queue
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index f8d08d0e40236..f75d31f542cf7 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -10,6 +10,7 @@ from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request, RequestStatus
 
 logger = init_logger(__name__)
@@ -78,11 +79,28 @@ class KVCacheManager:
         self.req_to_block_hashes: DefaultDict[
             str, List[BlockHashType]] = defaultdict(list)
 
+        self.prefix_cache_stats = PrefixCacheStats()
+
     @property
     def usage(self) -> float:
+        """Get the KV cache usage.
+
+        Returns:
+            The KV cache usage (between 0.0 and 1.0).
+        """
         return 1.0 - (self.free_block_queue.num_free_blocks /
                       self.num_gpu_blocks)
 
+    def make_prefix_cache_stats(self) -> PrefixCacheStats:
+        """Get (and reset) the prefix cache stats.
+
+        Returns:
+            The current prefix caching stats.
+        """
+        stats = self.prefix_cache_stats
+        self.prefix_cache_stats = PrefixCacheStats()
+        return stats
+
     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.
@@ -118,6 +136,10 @@ class KVCacheManager:
             else:
                 break
 
+        self.prefix_cache_stats.requests += 1
+        self.prefix_cache_stats.queries += len(block_hashes)
+        self.prefix_cache_stats.hits += len(computed_blocks)
+
         # NOTE(woosuk): Since incomplete blocks are not eligible for
         # sharing, `num_computed_tokens` is always a multiple of
         # `block_size`.
@@ -280,6 +302,8 @@ class KVCacheManager:
         for block in self.block_pool:
             block.reset_hash()
 
+        self.prefix_cache_stats.reset = True
+
         logger.info("Successfully reset prefix cache")
         return True
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 6888f1a3e1823..bddb482d29167 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """KV-Cache Utilities."""
+from collections import deque
 from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any, List, NamedTuple, Optional, Tuple
@@ -8,6 +9,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
                                         KVCacheTensor)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -28,6 +30,68 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+class PrefixCachingMetrics:
+    """Metrics for prefix caching with a hit rate of the most recent N requests.
+
+    Args:
+        interval: The number of the most recent requests to aggregate.
+            Defaults to 1000.
+    """
+
+    def __init__(self, interval: int = 1000):
+        self.interval = interval
+        # The current aggregated values.
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        # A deque of (requests, queries, hits) for the most recent requests.
+        self.query_queue: deque[Tuple[int, int, int]] = deque()
+
+    def observe(self, stats: PrefixCacheStats):
+        """Observe the prefix caching for a set of requests.
+
+        This function is called with information gathered when new requests
+        are being scheduled and are looking for computed blocks.
+
+        When there are more than `interval` requests, the oldest set of
+        requests is removed from the metrics.
+
+        Args:
+            stats: The prefix cache stats.
+        """
+        # reset_prefix_cache was invoked before the current update.
+        # Reset the metrics before aggregating the current stats.
+        if stats.reset:
+            self.reset()
+
+        # Update the metrics.
+        self.query_queue.append((stats.requests, stats.queries, stats.hits))
+        self.aggregated_requests += stats.requests
+        self.aggregated_query_total += stats.queries
+        self.aggregated_query_hit += stats.hits
+
+        # Remove the oldest stats if the request count exceeds the interval.
+        if self.aggregated_requests > self.interval:
+            old_requests, old_queries, old_hits = self.query_queue.popleft()
+            self.aggregated_requests -= old_requests
+            self.aggregated_query_total -= old_queries
+            self.aggregated_query_hit -= old_hits
+
+    def reset(self):
+        """Reset the metrics."""
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        self.query_queue.clear()
+
+    @property
+    def hit_rate(self) -> float:
+        """Calculate the hit rate for the past N requests."""
+        if self.aggregated_query_total == 0:
+            return 0.0
+        return self.aggregated_query_hit / self.aggregated_query_total
+
+
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 1c54914d182ba..985fcf01bb216 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -593,4 +593,5 @@ class Scheduler:
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
+            prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
         )
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index eb1acf584c6b0..3472761dc1808 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -9,6 +9,7 @@ import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
@@ -37,6 +38,9 @@ class LoggingStatLogger(StatLoggerBase):
         self.num_prompt_tokens: List[int] = []
         self.num_generation_tokens: List[int] = []
 
+        # Prefix cache metrics. TODO: Make the interval configurable.
+        self.prefix_caching_metrics = PrefixCachingMetrics()
+
     def _local_interval_elapsed(self, now: float) -> bool:
         # Log every _LOCAL_LOGGING_INTERVAL_SEC.
         elapsed_time = now - self.last_log_time
@@ -58,6 +62,8 @@ class LoggingStatLogger(StatLoggerBase):
 
         self._track_iteration_stats(iteration_stats)
 
+        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+
         now = time.monotonic()
         if not self._local_interval_elapsed(now):
             return
@@ -72,13 +78,15 @@ class LoggingStatLogger(StatLoggerBase):
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs "
-            "GPU KV cache usage: %.1f%%.",
+            "Running: %d reqs, Waiting: %d reqs, "
+            "GPU KV cache usage: %.1f%%, "
+            "Prefix cache hit rate: %.1f%%",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
             scheduler_stats.gpu_cache_usage * 100,
+            self.prefix_caching_metrics.hit_rate * 100,
         )
@@ -107,6 +115,18 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=
+            "GPU prefix cache queries, in terms of number of queried blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_hits",
+            documentation=
+            "GPU prefix cache hits, in terms of number of cached blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
@@ -170,6 +190,11 @@ class PrometheusStatLogger(StatLoggerBase):
 
         self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
 
+        self.counter_gpu_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 5e588d35ea4d7..f806b0adf5d5a 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
@@ -9,6 +9,20 @@ if TYPE_CHECKING:
     from vllm.v1.engine import EngineCoreOutput, FinishReason
 
 
+@dataclass
+class PrefixCacheStats:
+    """Stores prefix cache hit statistics."""
+    # Whether reset_prefix_cache was invoked.
+    reset: bool = False
+    # The number of requests in this update.
+    requests: int = 0
+    # The number of queries in these requests. Note that "queries" here
+    # means the number of blocks that were queried from the cache.
+    queries: int = 0
+    # The number of hits in these requests.
+    hits: int = 0
+
+
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler."""
@@ -17,7 +31,9 @@ class SchedulerStats:
     num_waiting_reqs: int = 0
 
     gpu_cache_usage: float = 0.0
-    # gpu_prefix_cache_hit_rate: float = 0.0
+
+    prefix_cache_stats: PrefixCacheStats = field(
+        default_factory=PrefixCacheStats)
 
 
 @dataclass
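Not part of the patch: for reviewers who want to sanity-check the sliding-window aggregation outside of pytest, the standalone Python snippet below mirrors the logic of PrefixCachingMetrics.observe() and hit_rate using plain tuples and reproduces the numbers in test_metrics(). The Stats namedtuple, the rolling_hit_rate helper, and the interval=5 default are illustrative assumptions for this sketch, not vLLM APIs.

from collections import deque, namedtuple

# Illustrative stand-in for PrefixCacheStats: (requests, queries, hits),
# where queries and hits are counted in blocks.
Stats = namedtuple("Stats", ["requests", "queries", "hits"])


def rolling_hit_rate(updates, interval=5):
    """Aggregate the block hit rate over roughly the last `interval` requests,
    evicting whole updates once the aggregated request count exceeds it."""
    queue = deque()
    requests = queries = hits = 0
    for update in updates:
        queue.append(update)
        requests += update.requests
        queries += update.queries
        hits += update.hits
        # Evict the oldest update when the window grows past `interval`.
        if requests > interval:
            old = queue.popleft()
            requests -= old.requests
            queries -= old.queries
            hits -= old.hits
    return hits / queries if queries else 0.0


# Mirrors test_metrics(): 9/20 = 0.45, 25/100 = 0.25, then 18/90 = 0.2
# after the oldest update (1 request, 20 queries, 9 hits) is evicted.
print(rolling_hit_rate([Stats(1, 20, 9)]))
print(rolling_hit_rate([Stats(1, 20, 9), Stats(4, 80, 16)]))
print(rolling_hit_rate([Stats(1, 20, 9), Stats(4, 80, 16), Stats(1, 10, 2)]))

As in the patch, eviction removes whole per-step updates rather than individual requests, so the window covers approximately, not exactly, the most recent `interval` requests.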