[V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
parent d4629dc43f
commit 5f52a84685
@@ -14,6 +14,39 @@ logger = init_logger(__name__)
 
 
 class EncoderCacheManager:
+    """Manages caching of encoder outputs for multimodal models in vLLM V1.
+
+    The EncoderCacheManager handles the lifecycle of multimodal encoder outputs
+    (such as vision embeddings from images) during request processing. It
+    provides memory-aware caching to avoid recomputing encoder outputs when the
+    same multimodal inputs appear in different stages of request processing.
+
+    This manager is particularly important for:
+    - Vision-language models (e.g., LLaVA) where image encoder outputs are
+      cached
+    - Any multimodal model where encoder computation is expensive and
+      cacheable
+
+    The cache operates at the granularity of individual multimodal input items
+    within requests, allowing for fine-grained memory management and enabling
+    chunked processing of multimodal inputs.
+
+    Note that no caching is shared between requests at this time. If the same
+    input is used across multiple requests, it will be reprocessed for each
+    request.
+
+    Args:
+        cache_size: Limit the size of the cache, measured by the number of
+            tokens from the input sequence.
+
+    Attributes:
+        cache_size: Total cache capacity in encoder tokens
+        num_free_slots: Current available cache capacity in encoder tokens
+        cached: Mapping from request_id to set of cached input_ids for that
+            request
+        freed: List of (request_id, input_id) pairs that were recently freed.
+            This is cleared after every call to get_freed_ids().
+    """
 
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
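
To make the Attributes section above concrete, here is a minimal sketch of constructing the manager and inspecting its initial state. The import path is an assumption based on the module this commit documents, and the cache_size value is arbitrary.

    # Minimal sketch; the module path and cache_size are assumptions, not part of this commit.
    from vllm.v1.core.encoder_cache_manager import EncoderCacheManager

    manager = EncoderCacheManager(cache_size=4096)  # capacity measured in encoder tokens

    print(manager.cache_size)      # 4096 -- total capacity
    print(manager.num_free_slots)  # 4096 -- nothing allocated yet
    print(manager.cached)          # {}   -- request_id -> set of cached input_ids
    print(manager.freed)           # []   -- (request_id, input_id) pairs awaiting get_freed_ids()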
@@ -24,14 +57,48 @@ class EncoderCacheManager:
         self.freed: list[tuple[str, int]] = []
 
     def has_cache(self, request: Request, input_id: int) -> bool:
+        """Check if encoder output for a specific multimodal input is cached.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if the encoder output for this input is already cached
+        """
         req_id = request.request_id
         return req_id in self.cached and input_id in self.cached[req_id]
 
     def can_allocate(self, request: Request, input_id: int) -> bool:
+        """Check if there's sufficient cache space for a multimodal input.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if there's enough free cache space to store the encoder output
+            for this multimodal input
+        """
         num_tokens = request.get_num_encoder_tokens(input_id)
         return num_tokens <= self.num_free_slots
 
     def allocate(self, request: Request, input_id: int) -> None:
+        """Allocate cache space for a multimodal input's encoder output.
+
+        This method reserves cache space for storing the encoder output of
+        the specified multimodal input. The actual encoder output storage
+        happens in the model runner, but this method ensures the cache
+        manager tracks the allocation.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Note:
+            This method assumes can_allocate() returned True for the same
+            request and input_id. It will reduce available cache space.
+        """
         req_id = request.request_id
         if req_id not in self.cached:
             self.cached[req_id] = set()
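
The three methods documented above are used together in a check-then-allocate pattern on the scheduler side. Below is a minimal sketch of that pattern under stated assumptions: the import path is assumed, and _StubRequest is a hypothetical stand-in that exposes only the two members the manager actually reads (request_id and get_num_encoder_tokens); in vLLM the scheduler passes its real Request objects.

    from dataclasses import dataclass, field

    # Path assumed from the module this commit documents; adjust if it differs.
    from vllm.v1.core.encoder_cache_manager import EncoderCacheManager


    @dataclass
    class _StubRequest:
        """Hypothetical stand-in exposing only what EncoderCacheManager reads."""

        request_id: str
        # input_id -> number of encoder tokens produced for that multimodal item
        encoder_tokens: dict[int, int] = field(default_factory=dict)

        def get_num_encoder_tokens(self, input_id: int) -> int:
            return self.encoder_tokens[input_id]


    manager = EncoderCacheManager(cache_size=8192)
    req = _StubRequest(request_id="req-0", encoder_tokens={0: 576, 1: 576})

    for input_id in (0, 1):
        if manager.has_cache(req, input_id):
            continue  # encoder output already cached for this item
        if manager.can_allocate(req, input_id):
            # Reserve the slots; the model runner stores the actual encoder output.
            manager.allocate(req, input_id)
        # else: not enough free slots, so the scheduler would defer this item

    print(manager.num_free_slots)  # 8192 - 576 - 576 = 7040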
@@ -39,10 +106,30 @@ class EncoderCacheManager:
         self.num_free_slots -= request.get_num_encoder_tokens(input_id)
 
     def get_cached_input_ids(self, request: Request) -> set[int]:
+        """Get all cached multimodal input IDs for a request.
+
+        Args:
+            request: The request to query
+
+        Returns:
+            Set of input_ids that have cached encoder outputs for this request.
+            Returns empty set if no inputs are cached for this request.
+        """
         return self.cached.get(request.request_id, set())
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
-        """Free a single encoder input id for the request."""
+        """Free cache space for a single multimodal input's encoder output.
+
+        This method is called when:
+        - The encoder output has been fully consumed by the decoder and is
+          no longer needed (e.g., in vision-language models after image
+          tokens are processed)
+        - A request is being cancelled or aborted
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input to free from cache
+        """
         req_id = request.request_id
        if req_id not in self.cached:
             return
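
Continuing the sketch above, freeing a single consumed input returns its tokens to the pool and records the (request_id, input_id) pair for a later get_freed_ids() call:

    # Continuing the earlier sketch: input 0 has been fully consumed by the decoder.
    manager.free_encoder_input(req, 0)

    print(manager.get_cached_input_ids(req))  # {1} -- only input 1 remains cached
    print(manager.num_free_slots)             # 7616 -- input 0's 576 tokens returned
    print(manager.freed)                      # [('req-0', 0)] -- pending worker notification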
@@ -54,12 +141,29 @@ class EncoderCacheManager:
         self.freed.append((req_id, input_id))
 
     def free(self, request: Request) -> None:
-        """Free all cached input ids for the request."""
+        """Free all cached encoder outputs for a request.
+
+        This method is typically called when a request is finished, cancelled,
+        or aborted, and all its encoder outputs should be freed from cache.
+
+        Args:
+            request: The request whose encoder outputs should be freed
+        """
         input_ids = self.get_cached_input_ids(request).copy()
         for input_id in input_ids:
             self.free_encoder_input(request, input_id)
 
     def get_freed_ids(self) -> list[tuple[str, int]]:
+        """Get and clear the list of recently freed encoder cache entries.
+
+        This method returns all encoder cache entries that were freed since
+        the last call to this method. It's used by the scheduler to notify
+        workers about which encoder outputs can be removed from their caches.
+
+        Returns:
+            List of (request_id, input_id) tuples that were freed since the
+            last call. The internal freed list is cleared after this call.
+        """
         freed = self.freed
         self.freed = []
         return freed
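
Finally, continuing the same sketch, free() plus get_freed_ids() covers request teardown and the scheduler-to-worker notification flow described above:

    # Continuing the earlier sketch: the request finishes, so everything it still
    # holds is released, and the scheduler drains the freed list once per step.
    manager.free(req)

    print(manager.get_freed_ids())  # [('req-0', 0), ('req-0', 1)] -- sent to workers
    print(manager.get_freed_ids())  # [] -- the internal list was cleared by the first call
    print(manager.num_free_slots)   # 8192 -- back to full capacity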