mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-05 20:32:21 +08:00
[v1][KVCacheManager] Change prefix caching metric from counting blocks to counting tokens (#18003)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
parent
e9c730c9bd
commit
302f3aca7e
@ -161,11 +161,15 @@ class KVCacheManager:
|
|||||||
|
|
||||||
computed_blocks = (
|
computed_blocks = (
|
||||||
self.single_type_manager.find_longest_cache_hit(block_hashes))
|
self.single_type_manager.find_longest_cache_hit(block_hashes))
|
||||||
|
# NOTE(woosuk): Since incomplete blocks are not eligible for
|
||||||
|
# sharing, `num_computed_tokens` is always a multiple of
|
||||||
|
# `block_size`.
|
||||||
|
num_computed_tokens = len(computed_blocks) * self.block_size
|
||||||
|
|
||||||
if self.log_stats:
|
if self.log_stats:
|
||||||
assert self.prefix_cache_stats is not None
|
assert self.prefix_cache_stats is not None
|
||||||
self.prefix_cache_stats.queries += len(block_hashes)
|
self.prefix_cache_stats.queries += request.num_tokens
|
||||||
self.prefix_cache_stats.hits += len(computed_blocks)
|
self.prefix_cache_stats.hits += num_computed_tokens
|
||||||
|
|
||||||
if last_block_hash is not None:
|
if last_block_hash is not None:
|
||||||
# Add back the last block hash if it was removed.
|
# Add back the last block hash if it was removed.
|
||||||
@ -173,10 +177,6 @@ class KVCacheManager:
|
|||||||
# we shouldn't modify it directly.
|
# we shouldn't modify it directly.
|
||||||
block_hashes.append(last_block_hash)
|
block_hashes.append(last_block_hash)
|
||||||
|
|
||||||
# NOTE(woosuk): Since incomplete blocks are not eligible for
|
|
||||||
# sharing, `num_computed_tokens` is always a multiple of
|
|
||||||
# `block_size`.
|
|
||||||
num_computed_tokens = len(computed_blocks) * self.block_size
|
|
||||||
return KVCacheBlocks(computed_blocks), num_computed_tokens
|
return KVCacheBlocks(computed_blocks), num_computed_tokens
|
||||||
|
|
||||||
def allocate_slots(
|
def allocate_slots(
|
||||||
|
|||||||
@ -183,13 +183,13 @@ class PrometheusStatLogger(StatLoggerBase):
|
|||||||
self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
|
self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
|
||||||
name="vllm:gpu_prefix_cache_queries",
|
name="vllm:gpu_prefix_cache_queries",
|
||||||
documentation=
|
documentation=
|
||||||
"GPU prefix cache queries, in terms of number of queried blocks.",
|
"GPU prefix cache queries, in terms of number of queried tokens.",
|
||||||
labelnames=labelnames).labels(*labelvalues)
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
|
self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
|
||||||
name="vllm:gpu_prefix_cache_hits",
|
name="vllm:gpu_prefix_cache_hits",
|
||||||
documentation=
|
documentation=
|
||||||
"GPU prefix cache hits, in terms of number of cached blocks.",
|
"GPU prefix cache hits, in terms of number of cached tokens.",
|
||||||
labelnames=labelnames).labels(*labelvalues)
|
labelnames=labelnames).labels(*labelvalues)
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|||||||
@ -19,7 +19,7 @@ class PrefixCacheStats:
|
|||||||
# The number of requests in this update.
|
# The number of requests in this update.
|
||||||
requests: int = 0
|
requests: int = 0
|
||||||
# The number of queries in these requests. Note that "queries" here
|
# The number of queries in these requests. Note that "queries" here
|
||||||
# means the number of blocks that were queried from the cache.
|
# means the number of tokens that were queried from the cache.
|
||||||
queries: int = 0
|
queries: int = 0
|
||||||
# The number of hits in these requests.
|
# The number of hits in these requests.
|
||||||
hits: int = 0
|
hits: int = 0
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user