mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 13:44:58 +08:00
[V1][Misc] stop update prefix cache stats when logs_stats is disabled (#16460)
Signed-off-by: vie-serendipity <2733147505@qq.com>
This commit is contained in:
parent
9d4ca19d50
commit
d9737ca1c6
@ -751,3 +751,25 @@ def test_reset_prefix_cache():
|
|||||||
assert manager.reset_prefix_cache()
|
assert manager.reset_prefix_cache()
|
||||||
assert not manager.block_pool.cached_block_hash_to_block
|
assert not manager.block_pool.cached_block_hash_to_block
|
||||||
assert all([blk.block_hash is None for blk in manager.block_pool.blocks])
|
assert all([blk.block_hash is None for blk in manager.block_pool.blocks])
|
||||||
|
|
||||||
|
|
||||||
|
def test_prefix_cache_stats_disabled():
|
||||||
|
"""Test that prefix_cache_stats is None when log_stats is False."""
|
||||||
|
manager = KVCacheManager(
|
||||||
|
make_kv_cache_config(16, 11),
|
||||||
|
max_model_len=8192,
|
||||||
|
enable_caching=True,
|
||||||
|
log_stats=False, # Disable logging stats
|
||||||
|
)
|
||||||
|
assert manager.prefix_cache_stats is None
|
||||||
|
|
||||||
|
# Call all functions that check whether log_stats is disabled.
|
||||||
|
req = make_request("0", list(range(16)))
|
||||||
|
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
|
||||||
|
assert not computed_blocks
|
||||||
|
assert num_computed_tokens == 0
|
||||||
|
manager.allocate_slots(req, 16, computed_blocks)
|
||||||
|
manager.reset_prefix_cache()
|
||||||
|
|
||||||
|
# Ensure prefix_cache_stats remains None
|
||||||
|
assert manager.prefix_cache_stats is None
|
||||||
|
|||||||
@ -39,8 +39,9 @@ class KVCacheManager:
|
|||||||
|
|
||||||
self.enable_caching = enable_caching
|
self.enable_caching = enable_caching
|
||||||
self.caching_hash_fn = sha256 if caching_hash_algo == "sha256" else hash
|
self.caching_hash_fn = sha256 if caching_hash_algo == "sha256" else hash
|
||||||
# FIXME: make prefix cache stats conditional on log_stats
|
|
||||||
self.log_stats = log_stats
|
self.log_stats = log_stats
|
||||||
|
# Prefix cache stats are only collected when log_stats is enabled.
|
||||||
|
self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
|
||||||
# NOTE(woosuk): To avoid frequent block allocation, we preallocate some
|
# NOTE(woosuk): To avoid frequent block allocation, we preallocate some
|
||||||
# blocks for each request. For example, when a request reaches the end
|
# blocks for each request. For example, when a request reaches the end
|
||||||
# of its block table, we preallocate N blocks in advance. This way, we
|
# of its block table, we preallocate N blocks in advance. This way, we
|
||||||
@ -79,7 +80,6 @@ class KVCacheManager:
|
|||||||
# This is only used to track the RUNNING requests, we do not track the
|
# This is only used to track the RUNNING requests, we do not track the
|
||||||
# data for preempted ones.
|
# data for preempted ones.
|
||||||
self.num_cached_block: dict[str, int] = {}
|
self.num_cached_block: dict[str, int] = {}
|
||||||
self.prefix_cache_stats = PrefixCacheStats()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def usage(self) -> float:
|
def usage(self) -> float:
|
||||||
@ -90,12 +90,14 @@ class KVCacheManager:
|
|||||||
"""
|
"""
|
||||||
return self.block_pool.get_usage()
|
return self.block_pool.get_usage()
|
||||||
|
|
||||||
def make_prefix_cache_stats(self) -> PrefixCacheStats:
|
def make_prefix_cache_stats(self) -> Optional[PrefixCacheStats]:
|
||||||
"""Get (and reset) the prefix cache stats.
|
"""Get (and reset) the prefix cache stats.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The current prefix caching stats.
|
The current prefix caching stats, or None if logging is disabled.
|
||||||
"""
|
"""
|
||||||
|
if not self.log_stats:
|
||||||
|
return None
|
||||||
stats = self.prefix_cache_stats
|
stats = self.prefix_cache_stats
|
||||||
self.prefix_cache_stats = PrefixCacheStats()
|
self.prefix_cache_stats = PrefixCacheStats()
|
||||||
return stats
|
return stats
|
||||||
@ -125,7 +127,9 @@ class KVCacheManager:
|
|||||||
self.block_size, request)
|
self.block_size, request)
|
||||||
self.req_to_block_hashes[request.request_id] = block_hashes
|
self.req_to_block_hashes[request.request_id] = block_hashes
|
||||||
|
|
||||||
self.prefix_cache_stats.requests += 1
|
if self.log_stats:
|
||||||
|
assert self.prefix_cache_stats is not None
|
||||||
|
self.prefix_cache_stats.requests += 1
|
||||||
# When the request requires prompt logprobs, we skip prefix caching.
|
# When the request requires prompt logprobs, we skip prefix caching.
|
||||||
if request.sampling_params.prompt_logprobs is not None:
|
if request.sampling_params.prompt_logprobs is not None:
|
||||||
return [], 0
|
return [], 0
|
||||||
@ -145,8 +149,10 @@ class KVCacheManager:
|
|||||||
|
|
||||||
computed_blocks = (
|
computed_blocks = (
|
||||||
self.specialized_manager.find_longest_cache_hit(block_hashes))
|
self.specialized_manager.find_longest_cache_hit(block_hashes))
|
||||||
self.prefix_cache_stats.queries += len(block_hashes)
|
if self.log_stats:
|
||||||
self.prefix_cache_stats.hits += len(computed_blocks)
|
assert self.prefix_cache_stats is not None
|
||||||
|
self.prefix_cache_stats.queries += len(block_hashes)
|
||||||
|
self.prefix_cache_stats.hits += len(computed_blocks)
|
||||||
|
|
||||||
if last_block_hash is not None:
|
if last_block_hash is not None:
|
||||||
# Add back the last block hash if it was removed.
|
# Add back the last block hash if it was removed.
|
||||||
@ -317,17 +323,19 @@ class KVCacheManager:
|
|||||||
|
|
||||||
def reset_prefix_cache(self) -> bool:
|
def reset_prefix_cache(self) -> bool:
|
||||||
"""Reset prefix cache. This function may be used in RLHF
|
"""Reset prefix cache. This function may be used in RLHF
|
||||||
flows to invalid prefix caching after the weights are updated,
|
flows to invalidate prefix caching after the weights are updated,
|
||||||
or used for resetting prefix caching status for benchmarking.
|
or used for resetting prefix caching status for benchmarking.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bool: True if the prefix cache is successfully reset,
|
bool: True if the prefix cache is successfully reset,
|
||||||
False otherwise.
|
False otherwise.
|
||||||
"""
|
"""
|
||||||
if self.block_pool.reset_prefix_cache():
|
if not self.block_pool.reset_prefix_cache():
|
||||||
|
return False
|
||||||
|
if self.log_stats:
|
||||||
|
assert self.prefix_cache_stats is not None
|
||||||
self.prefix_cache_stats.reset = True
|
self.prefix_cache_stats.reset = True
|
||||||
return True
|
return True
|
||||||
return False
|
|
||||||
|
|
||||||
def get_num_common_prefix_blocks(
|
def get_num_common_prefix_blocks(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -798,11 +798,13 @@ class Scheduler(SchedulerInterface):
|
|||||||
) -> Optional[SchedulerStats]:
|
) -> Optional[SchedulerStats]:
|
||||||
if not self.log_stats:
|
if not self.log_stats:
|
||||||
return None
|
return None
|
||||||
|
prefix_cache_stats = self.kv_cache_manager.make_prefix_cache_stats()
|
||||||
|
assert prefix_cache_stats is not None
|
||||||
return SchedulerStats(
|
return SchedulerStats(
|
||||||
num_running_reqs=len(self.running),
|
num_running_reqs=len(self.running),
|
||||||
num_waiting_reqs=len(self.waiting),
|
num_waiting_reqs=len(self.waiting),
|
||||||
gpu_cache_usage=self.kv_cache_manager.usage,
|
gpu_cache_usage=self.kv_cache_manager.usage,
|
||||||
prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
|
prefix_cache_stats=prefix_cache_stats,
|
||||||
spec_decoding_stats=spec_decoding_stats,
|
spec_decoding_stats=spec_decoding_stats,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user