mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Bugfix] Missing cached item in the MultiModalReceiverCache (#28525)

Signed-off-by: knlnguyen1802 <knlnguyen1802@gmail.com>
Co-authored-by: Chenguang Zheng <645327136@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>

parent d0985c5feb
commit fc6acc88ca
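The bug this commit fixes: the processor side (P0) computes is_cached() once per request, but an entry could be evicted between that snapshot and the final get_and_update(), so the receiver (P1) could be asked for an item it never received. The fix adds touch_sender_cache_item / touch_receiver_cache_item hooks that pin every hash of a request before any item is processed. A minimal standalone sketch of the sender half, using a toy LRU cache rather than vLLM's actual classes:

from collections import OrderedDict


class ToyLRUCache:
    """Toy stand-in for the sender-side LRU cache; not vLLM's implementation."""

    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self._data: OrderedDict[str, bytes] = OrderedDict()

    def is_cached(self, keys: list[str]) -> list[bool]:
        return [k in self._data for k in keys]

    def touch(self, key: str) -> None:
        # Mark the key most-recently-used so it survives until this
        # request has finished processing.
        if key in self._data:
            self._data.move_to_end(key)

    def put(self, key: str, value: bytes) -> None:
        self._data[key] = value
        while len(self._data) > self.capacity:
            self._data.popitem(last=False)  # evict least-recently-used


cache = ToyLRUCache(capacity=2)
cache.put("image_A", b"a")
cache.put("image_B", b"b")

# A request referencing one cached item and one new one.
hashes = ["image_A", "image_C"]
assert cache.is_cached(hashes) == [True, False]

# The fix: pin every hash of the request *before* inserting anything,
# so the is_cached snapshot stays valid for the whole request.
for h in hashes:
    cache.touch(h)
cache.put("image_C", b"c")

# image_A survived; without the touch loop, inserting image_C would have
# evicted it and the snapshot above would be stale.
assert cache.is_cached(["image_A", "image_C"]) == [True, True]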
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import multiprocessing as mp

 import numpy as np
 import pytest
@@ -8,9 +9,16 @@ import torch
 from vllm.config import ModelConfig, ParallelConfig, VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import (
+    BaseMultiModalProcessorCache,
+    BaseMultiModalReceiverCache,
     MultiModalCache,
+    MultiModalProcessorCacheInItem,
     MultiModalProcessorCacheItem,
     MultiModalProcessorCacheItemMetadata,
+    MultiModalProcessorSenderCache,
+    MultiModalReceiverCache,
+    ShmObjectStoreReceiverCache,
+    ShmObjectStoreSenderCache,
     engine_receiver_cache_from_config,
     processor_cache_from_config,
 )
@@ -22,6 +30,7 @@ from vllm.multimodal.inputs import (
     MultiModalSharedField,
 )
 from vllm.multimodal.processing import PromptInsertion
+from vllm.utils.mem_constants import GiB_bytes, MiB_bytes

 pytestmark = pytest.mark.cpu_test

@@ -144,8 +153,7 @@ def _compare_caches(
         MultiModalHasher.hash_kwargs(item=item.get_data()) for item in all_items
     ]

-    # Should not be used since there is nothing to convert to text
-    prompt_update = PromptInsertion("dummy", "target", "insertion")
+    prompt_update = PromptInsertion("dummy", "target", "insertion").resolve(0)

     for it in range(n_iter):
         num_items_to_select = rng.randint(0, max_items_per_iter)
@@ -159,10 +167,11 @@ def _compare_caches(
         else:
             for _ in range(is_cached_calls_per_iter):
                 cache_0_p0.is_cached(selected_hashes)
+
         cache_0_p0_out = [
             item
             for item, _ in cache_0_p0.get_and_update(
-                [(item, prompt_update.content) for item in selected_items],
+                [(item, [prompt_update]) for item in selected_items],
                 selected_hashes,
             )
         ]
@@ -172,10 +181,11 @@ def _compare_caches(
         else:
             for _ in range(is_cached_calls_per_iter):
                 cache_1_p0.is_cached(selected_hashes)
+
         cache_1_p0_out = [
             item
             for item, _ in cache_1_p0.get_and_update(
-                [(item, prompt_update.content) for item in selected_items],
+                [(item, [prompt_update]) for item in selected_items],
                 selected_hashes,
             )
         ]
@@ -225,3 +235,289 @@ def test_ipc_enable_disable_consistency(is_cached_calls_per_iter):
         vllm_config_ipc_enabled,
         is_cached_calls_per_iter=is_cached_calls_per_iter,
     )
+
+
+def _run_test_cache_eviction_lru(
+    p0_cache: BaseMultiModalProcessorCache,
+    p1_cache: BaseMultiModalReceiverCache,
+    base_item_size: int,
+):
+    request1_hashes = [
+        "image_A",
+        "image_B",
+        "image_C",
+    ]
+    request1_items = {
+        h: MultiModalKwargsItem.dummy(h, nbytes=2 * base_item_size)
+        for h in request1_hashes
+    }
+
+    request2_hashes = ["image_D", "image_E", "image_A", "image_C"]
+    request2_items = {
+        h: MultiModalKwargsItem.dummy(h, nbytes=1 * base_item_size)
+        for h in request2_hashes
+    }
+
+    ##########################
+    # STEP 1: Request 1 send
+    ##########################
+    sender_is_cached_item_req1 = p0_cache.is_cached(request1_hashes)
+    # Cache is empty
+    assert sender_is_cached_item_req1 == [False, False, False]
+
+    # Touch all mm hashes for the P0 cache before processing
+    for mm_hash in request1_hashes:
+        p0_cache.touch_sender_cache_item(mm_hash)
+
+    ###########################
+    # Process request 1 for P0 cache
+    ###########################
+    item_tuple: MultiModalProcessorCacheInItem
+    for i, h in enumerate(request1_hashes):
+        # Use the precomputed cache state
+        is_cached = sender_is_cached_item_req1[i]
+        item_tuple = (request1_items[h], []) if not is_cached else None
+        print(f"Request 1: key={h} | cached={is_cached}")
+
+        p0_cache.get_and_update_item(item_tuple, h)
+
+    ###########################
+    # Process request 1 for P1 cache
+    ###########################
+    # Touch all mm hashes for the P1 cache before processing
+    for mm_hash in request1_hashes:
+        p1_cache.touch_receiver_cache_item(mm_hash)
+
+    for h in request1_hashes:
+        p1_cache.get_and_update_item(request1_items[h], h)
+
+    expected_hashes = ["image_A", "image_B", "image_C"]
+    assert list(p0_cache._cache.order) == expected_hashes
+
+    ##########################
+    # STEP 2: Request 2 send
+    ##########################
+    sender_is_cached_item_req2 = p0_cache.is_cached(request2_hashes)
+    assert sender_is_cached_item_req2 == [False, False, True, True]
+
+    # Touch all mm hashes for the P0 cache before processing
+    for mm_hash in request2_hashes:
+        p0_cache.touch_sender_cache_item(mm_hash)
+
+    ###########################
+    # Process request 2 for P0 cache
+    ###########################
+    for i, h in enumerate(request2_hashes):
+        # Use the precomputed cache state again
+        is_cached = sender_is_cached_item_req2[i]
+        item_tuple = (request2_items[h], []) if not is_cached else None
+        print(f"Request 2: key={h} | cached={is_cached}")
+
+        p0_cache.get_and_update_item(item_tuple, h)
+
+    ###########################
+    # Process request 2 for P1 cache
+    ###########################
+
+    # Touch all mm hashes for the P1 cache before processing
+    for mm_hash in request2_hashes:
+        p1_cache.touch_receiver_cache_item(mm_hash)
+
+    for h in request2_hashes:
+        p1_cache.get_and_update_item(request2_items[h], h)
+
+    expected_hashes = ["image_D", "image_E", "image_A", "image_C"]
+    assert list(p0_cache._cache.order) == expected_hashes
+
+
+def test_cache_eviction_lru_cache():
+    model_config = ModelConfig(
+        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+        mm_processor_cache_gb=6 / GiB_bytes,
+    )
+    sender_cache = MultiModalProcessorSenderCache(model_config)
+    receiver_cache = MultiModalReceiverCache(model_config)
+
+    _run_test_cache_eviction_lru(sender_cache, receiver_cache, base_item_size=1)
+
+
+# This test verifies shared-memory cache eviction behavior across processor (p0)
+# and receiver (p1) caches.
+# Flow summary:
+# 1. Request 1 adds images A, B, C — completely filling the cache.
+# 2. Request 2 tries to add image_G and image_A, but image_G cannot be added because
+#    the cache is full and A is protected from eviction — the cache remains unchanged.
+# 3. Request 3 adds image_G, image_H, image_I and image_B.
+#    This time, image_A is evicted, freeing 5MB of space,
+#    so image_G and image_H fit, while
+#    image_B is protected from eviction and image_I cannot be added.
+# This proves normal eviction and reuse behavior.
+def _run_test_cache_eviction_shm(
+    p0_cache: BaseMultiModalProcessorCache,
+    p1_cache: BaseMultiModalReceiverCache,
+    base_item_size: int,
+):
+    request1_hashes = ["image_A", "image_B", "image_C"]
+    request1_items = {
+        h: MultiModalKwargsItem.dummy(h, nbytes=5 * base_item_size)
+        for h in request1_hashes
+    }
+    request1_items_p0_result = []
+
+    request2_hashes = ["image_G", "image_A"]
+    request2_items = {
+        h: MultiModalKwargsItem.dummy(
+            h, nbytes=(5 if h in request1_hashes else 2) * base_item_size
+        )
+        for h in request2_hashes
+    }
+    request2_items_p0_result = []
+
+    request3_hashes = ["image_G", "image_H", "image_I", "image_B"]
+    request3_items = {
+        h: MultiModalKwargsItem.dummy(
+            h, nbytes=(5 if h in request1_hashes else 2) * base_item_size
+        )
+        for h in request3_hashes
+    }
+    request3_items_p0_result = []
+
+    ##########################
+    # STEP 1: Request 1 send
+    # This will fill up the cache
+    ##########################
+    sender_is_cached_item_req1 = p0_cache.is_cached(request1_hashes)
+    # Cache is empty
+    assert sender_is_cached_item_req1 == [False, False, False]
+
+    # Touch all mm hashes for the P0 cache before processing
+    for mm_hash in request1_hashes:
+        p0_cache.touch_sender_cache_item(mm_hash)
+
+    ###########################
+    # Process request 1 for P0 cache
+    ###########################
+    item_tuple: MultiModalProcessorCacheInItem
+    for i, h in enumerate(request1_hashes):
+        # Use the precomputed cache state
+        is_cached = sender_is_cached_item_req1[i]
+        item_tuple = (request1_items[h], []) if not is_cached else None
+        print(f"Request 1: key={h} | cached={is_cached}")
+
+        p0_result = p0_cache.get_and_update_item(item_tuple, h)
+        # Only keep the mm item; ignore the prompt update result
+        request1_items_p0_result.append(p0_result[0])
+
+    ###########################
+    # Process request 1 for P1 cache
+    ###########################
+    # Touch all mm hashes for the P1 cache before processing
+    for mm_hash, mm_item in zip(request1_hashes, request1_items_p0_result):
+        p1_cache.touch_receiver_cache_item(mm_hash, mm_item)
+
+    for mm_hash, mm_item in zip(request1_hashes, request1_items_p0_result):
+        p1_cache.get_and_update_item(mm_item, mm_hash)
+
+    expected_hashes = ["image_A", "image_B", "image_C"]
+    assert list(p0_cache._shm_cache.key_index.keys()) == expected_hashes
+
+    ##########################
+    # STEP 2: Request 2 send
+    # There is no eviction because image_A is protected,
+    # so no new item can be added to the cache
+    ##########################
+    sender_is_cached_item_req2 = p0_cache.is_cached(request2_hashes)
+    assert sender_is_cached_item_req2 == [False, True]
+
+    # Touch all mm hashes for the P0 cache before processing
+    for mm_hash in request2_hashes:
+        p0_cache.touch_sender_cache_item(mm_hash)
+
+    ###########################
+    # Process request 2 for P0 cache
+    ###########################
+    for i, h in enumerate(request2_hashes):
+        # Use the precomputed cache state again
+        is_cached = sender_is_cached_item_req2[i]
+        item_tuple = (request2_items[h], []) if not is_cached else None
+        print(f"Request 2: key={h} | cached={is_cached}")
+
+        p0_result = p0_cache.get_and_update_item(item_tuple, h)
+        # Only keep the mm item; ignore the prompt update result
+        request2_items_p0_result.append(p0_result[0])
+
+    # image_A cannot be evicted, so
+    # image_G fails to allocate and image_A stays in the cache
+    assert p0_cache.is_cached(request2_hashes) == [False, True]
+
+    ###########################
+    # Process request 2 for P1 cache
+    ###########################
+
+    # Touch all mm hashes for the P1 cache before processing
+    for mm_hash, mm_item in zip(request2_hashes, request2_items_p0_result):
+        p1_cache.touch_receiver_cache_item(mm_hash, mm_item)
+
+    for mm_hash, mm_item in zip(request2_hashes, request2_items_p0_result):
+        p1_cache.get_and_update_item(mm_item, mm_hash)
+
+    # Prove that the cache state is unchanged
+    expected_hashes = ["image_A", "image_B", "image_C"]
+    assert list(p0_cache._shm_cache.key_index.keys()) == expected_hashes
+
+    ##########################
+    # STEP 3: Request 3 send
+    ##########################
+    ##### Prove that cache eviction works normally
+    sender_is_cached_item_req3 = p0_cache.is_cached(request3_hashes)
+    assert sender_is_cached_item_req3 == [False, False, False, True]
+
+    # Touch all mm hashes for the P0 cache before processing
+    for mm_hash in request3_hashes:
+        p0_cache.touch_sender_cache_item(mm_hash)
+
+    ###########################
+    # Process request 3 for P0 cache
+    ###########################
+    for i, h in enumerate(request3_hashes):
+        # Use the precomputed cache state again
+        is_cached = sender_is_cached_item_req3[i]
+        item_tuple = (request3_items[h], []) if not is_cached else None
+        print(f"Request 3: key={h} | cached={is_cached}")
+        p0_result = p0_cache.get_and_update_item(item_tuple, h)
+        # Only keep the mm item; ignore the prompt update result
+        request3_items_p0_result.append(p0_result[0])
+
+    # image_A gets evicted and image_G is added to the cache;
+    # image_B is still protected;
+    # image_G and image_H fit but image_I cannot
+    assert p0_cache.is_cached(request3_hashes) == [True, True, False, True]
+
+    ###########################
+    # Process request 3 for P1 cache
+    ###########################
+
+    # Touch all mm hashes for the P1 cache before processing
+    for mm_hash, mm_item in zip(request3_hashes, request3_items_p0_result):
+        p1_cache.touch_receiver_cache_item(mm_hash, mm_item)
+
+    for mm_hash, mm_item in zip(request3_hashes, request3_items_p0_result):
+        p1_cache.get_and_update_item(mm_item, mm_hash)
+
+    expected_hashes = ["image_B", "image_C", "image_G", "image_H"]
+    assert list(p0_cache._shm_cache.key_index.keys()) == expected_hashes
+
+
+def test_cache_eviction_shm_cache():
+    vllm_config = VllmConfig(
+        model_config=ModelConfig(
+            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+            mm_processor_cache_type="shm",
+            mm_shm_cache_max_object_size_mb=6,
+            mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
+        ),
+    )
+    sender_cache = ShmObjectStoreSenderCache(vllm_config)
+    receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())

+    _run_test_cache_eviction_shm(sender_cache, receiver_cache, base_item_size=MiB_bytes)
@@ -574,7 +574,6 @@ class SingleWriterShmObjectStorage:
             value
         )
         buffer_size = self.flag_bytes + data_bytes + md_bytes
-
         # Sanity checks
         if buffer_size > self.max_object_size:
             raise ValueError(
@@ -626,6 +625,44 @@ class SingleWriterShmObjectStorage:

         return obj

+    def touch(
+        self,
+        key: str,
+        address: int = 0,
+        monotonic_id: int = 0,
+    ) -> None:
+        """
+        Touch an existing cached item to update its eviction status.
+
+        For writers (ShmObjectStoreSenderCache): increment writer_flag.
+        For readers (ShmObjectStoreReceiverCache): increment reader_count.
+
+        Args:
+            key: String key of the object to touch
+            address: Address of the object (only for readers)
+            monotonic_id: Monotonic ID of the object (only for readers)
+
+        """
+        if self._reader_lock is None:
+            if key not in self.key_index:
+                return None
+            address, monotonic_id = self.key_index[key]
+            # Writer side: increment writer_flag to raise the eviction threshold
+            self.increment_writer_flag(monotonic_id)
+        else:
+            with (
+                self._reader_lock,
+                self.ring_buffer.access_buf(address) as (data_view, _),
+            ):
+                reader_count = self.ring_buffer.byte2int(data_view[: self.flag_bytes])
+
+                # NOTE(Long):
+                # Avoid increasing the flag on a newly added item (sync with
+                # sender), since pre-touch has no effect on the writer side
+                # when a new item is added.
+                if reader_count >= self.n_readers:
+                    self.increment_reader_flag(data_view[: self.flag_bytes])
+
     def handle(self):
         """Get handle for sharing across processes."""
         return ShmObjectStorageHandle(
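The reader-side branch of touch() above only increments the flag once every reader has already consumed the entry. A hypothetical, simplified model of that guard, with the shared-memory ring buffer replaced by a plain dict for illustration:

N_READERS = 2


def reader_touch(reader_counts: dict[str, int], key: str) -> None:
    # Skip entries that not every reader has consumed yet: a newly added
    # item is still covered by the sender's writer_flag, so pre-touching
    # it would desynchronize the two sides.
    if reader_counts[key] >= N_READERS:
        reader_counts[key] += 1  # stands in for increment_reader_flag


counts = {"new_item": 1, "old_item": 2}
reader_touch(counts, "new_item")  # no-op: not all readers have seen it yet
reader_touch(counts, "old_item")  # pinned against eviction
assert counts == {"new_item": 1, "old_item": 3}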
@@ -302,6 +302,19 @@ class BaseMultiModalProcessorCache(
        """
        return [self.is_cached_item(mm_hash) for mm_hash in mm_hashes]

+    @abstractmethod
+    def touch_sender_cache_item(self, mm_hash: str) -> None:
+        """
+        Update the cache eviction order for a multi-modal item.
+
+        This is used to touch the item in the cache without changing
+        its value.
+
+        Args:
+            mm_hash: The hash of the multi-modal item.
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def make_stats(self, *, delta: bool = False) -> CacheInfo:
         """
@@ -353,6 +366,10 @@ class MultiModalProcessorOnlyCache(BaseMultiModalProcessorCache):

         return mm_item

+    @override
+    def touch_sender_cache_item(self, mm_hash: str) -> None:
+        self._cache.touch(mm_hash)
+
     @override
     def clear_cache(self) -> None:
         self._cache.clear()
@@ -407,6 +424,10 @@ class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache):

         return mm_item

+    @override
+    def touch_sender_cache_item(self, mm_hash: str) -> None:
+        self._cache.touch(mm_hash)
+
     @override
     def clear_cache(self) -> None:
         self._cache.clear()
@@ -501,6 +522,12 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
             logger.debug("Failed to cache mm_input with hash %s: %s", mm_hash, e)
         return mm_item

+    @override
+    def touch_sender_cache_item(self, mm_hash: str) -> None:
+        """Touch the item in shared memory cache to prevent eviction.
+        Increments writer_flag on sender side."""
+        self._shm_cache.touch(mm_hash)
+
     @override
     def clear_cache(self) -> None:
         self._shm_cache.clear()
@@ -610,11 +637,37 @@ class BaseMultiModalReceiverCache(
         self,
         mm_features: list["MultiModalFeatureSpec"],
     ) -> list["MultiModalFeatureSpec"]:
-        """Update multimodal features with cached encoder outputs."""
+        """
+        Update multimodal features with cached encoder outputs.
+        Touch all identifiers first, before updating, so that items
+        in the updated list are not evicted during the update.
+        """
+        for feature in mm_features:
+            self.touch_receiver_cache_item(feature.identifier, feature.data)
+
         for feature in mm_features:
             feature.data = self.get_and_update_item(feature.data, feature.identifier)
         return mm_features
+
+    @abstractmethod
+    def touch_receiver_cache_item(
+        self,
+        mm_hash: str,
+        mm_item: MultiModalKwargsItem | None = None,
+    ) -> None:
+        """
+        Update the cache eviction order for a multi-modal item.
+
+        This is used to touch the item in the cache without changing
+        its value.
+
+        Args:
+            mm_hash: The hash of the multi-modal item.
+            mm_item: The multi-modal item itself. This is optional and
+                may not be needed by some cache implementations.
+        """
+        raise NotImplementedError


 class MultiModalReceiverCache(BaseMultiModalReceiverCache):
     """
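The pre-touch loop above is the receiver half of the fix named in the commit title: when a request mixes a new item with an already-cached one whose payload was therefore not re-sent, updating feature by feature could evict the cached entry before it is read. A minimal sketch of that failure mode, again using a toy ordered-dict LRU rather than vLLM's classes:

from collections import OrderedDict

CAPACITY = 2
cache: OrderedDict[str, bytes] = OrderedDict()


def touch(key: str) -> None:
    if key in cache:
        cache.move_to_end(key)


def get_and_update(item: bytes | None, key: str) -> bytes:
    if item is None:  # cached on the receiver; the payload was not re-sent
        return cache[key]  # KeyError here is the "missing cached item" bug
    cache[key] = item
    while len(cache) > CAPACITY:
        cache.popitem(last=False)  # evict least-recently-used
    return item


get_and_update(b"a", "image_A")
get_and_update(b"b", "image_B")

# New request: fresh image_C plus already-cached image_A (sent as None).
features = [("image_C", b"c"), ("image_A", None)]
for key, _ in features:
    touch(key)  # pin image_A before any insertion can evict it
for key, item in features:
    get_and_update(item, key)  # without the touch loop: KeyError on image_A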
@@ -651,6 +704,14 @@ class MultiModalReceiverCache(BaseMultiModalReceiverCache):
         self._cache[mm_hash] = mm_item
         return mm_item

+    @override
+    def touch_receiver_cache_item(
+        self,
+        mm_hash: str,
+        mm_item: MultiModalKwargsItem | None = None,
+    ) -> None:
+        self._cache.touch(mm_hash)
+
     @override
     def clear_cache(self) -> None:
         self._cache.clear()
@@ -703,6 +764,20 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):

         return mm_item

+    @override
+    def touch_receiver_cache_item(
+        self,
+        mm_hash: str,
+        mm_item: MultiModalKwargsItem | None = None,
+    ) -> None:
+        """Touch the item in shared memory cache to prevent eviction.
+        Increments reader_count on receiver side."""
+        assert mm_item is not None
+        if "address" in mm_item:
+            address = cast(int, mm_item["address"].data)
+            monotonic_id = cast(int, mm_item["monotonic_id"].data)
+            self._shm_cache.touch(mm_hash, address=address, monotonic_id=monotonic_id)
+
     @override
     def clear_cache(self) -> None:
         self._shm_cache.clear()
@@ -721,12 +721,12 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
     """

     @staticmethod
-    def dummy(modality: str):
+    def dummy(modality: str, nbytes: int = 1):
         """Convenience class for testing."""
         mm_elem = MultiModalFieldElem(
             modality=modality,
             key="dummy",
-            data=torch.empty(1),
+            data=torch.empty(nbytes, dtype=torch.uint8),
             field=MultiModalSharedField(1),
         )
         return MultiModalKwargsItem.from_elems([mm_elem])
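The widened dummy() signature lets the eviction tests above construct items of an exact byte size: torch.empty(nbytes, dtype=torch.uint8) allocates one byte per element. A short usage sketch, assuming a vLLM checkout where this test-only helper is available:

from vllm.multimodal.inputs import MultiModalKwargsItem

# A 5 MiB dummy image item; the single field is keyed "dummy" by the helper.
item = MultiModalKwargsItem.dummy("image", nbytes=5 * 1024 * 1024)
assert item["dummy"].data.nbytes == 5 * 1024 * 1024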
@@ -1248,7 +1248,13 @@ _I = TypeVar("_I", bound=BaseProcessingInfo)

 MultiModalHashes = dict[str, list[str]]
 """
-A collection of hashes with a similar structure as
+A collection of the multi-modal hash for each item, with a similar structure as
+[`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
+"""
+
+MultiModalIsCached = dict[str, list[bool]]
+"""
+A collection of the `is_cached` flag for each item, with a similar structure as
 [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
 """

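The two aliases are parallel structures keyed by modality: mm_is_cached[m][i] says whether the item with hash mm_hashes[m][i] was already present in the sender cache. An illustrative literal (the hash strings are hypothetical):

mm_hashes: dict[str, list[str]] = {
    "image": ["image_A", "image_B"],
    "audio": ["audio_X"],
}
mm_is_cached: dict[str, list[bool]] = {
    "image": [True, False],
    "audio": [False],
}
# Per modality, the flag list lines up index-by-index with the hash list.
assert all(len(mm_is_cached[m]) == len(mm_hashes[m]) for m in mm_hashes)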
@@ -1725,7 +1731,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         cache: BaseMultiModalProcessorCache,
         mm_data_items: MultiModalDataItems,
         mm_hashes: MultiModalHashes,
-    ) -> MultiModalDataItems:
+    ) -> tuple[MultiModalIsCached, MultiModalDataItems]:
         mm_is_cached = {
             modality: cache.is_cached(hashes) for modality, hashes in mm_hashes.items()
         }
@@ -1752,7 +1758,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                 missing_modality_data.append(data)
             mm_missing_data[modality] = missing_modality_data

-        return self._to_mm_items(mm_missing_data)
+        return mm_is_cached, self._to_mm_items(mm_missing_data)

     def _recompute_cached_prompt_update(
         self,
@@ -1769,14 +1775,15 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         self,
         cache: BaseMultiModalProcessorCache,
         mm_hashes: MultiModalHashes,
+        mm_is_cached: MultiModalIsCached,
         mm_missing_kwargs: MultiModalKwargsItems,
         mm_missing_prompt_updates: MultiModalPromptUpdates,
     ) -> tuple[MultiModalKwargsOptionalItems, MultiModalPromptUpdates]:
-        # Need to calculate this at the beginning to avoid skipping cache logic
-        # for subsequently repeated items in the same modality
-        mm_is_cached = {
-            modality: cache.is_cached(hashes) for modality, hashes in mm_hashes.items()
-        }
+        # Touch all mm hashes before updating so that items in the updated
+        # list are not evicted during the update
+        for hashes in mm_hashes.values():
+            for item_hash in hashes:
+                cache.touch_sender_cache_item(item_hash)

         mm_missing_next_idx = defaultdict[str, int](lambda: 0)

@@ -1789,15 +1796,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             missing_prompt_updates = mm_missing_prompt_updates.get(modality, [])

             for item_idx, item_hash in enumerate(hashes):
-                kwargs: MultiModalKwargsItem | None
                 if not mm_is_cached[modality][item_idx]:
                     missing_next_idx = mm_missing_next_idx[modality]
-                    kwargs = missing_kwargs[missing_next_idx]
-                    updates = missing_prompt_updates[missing_next_idx]
+                    missing_kwargs_item = missing_kwargs[missing_next_idx]
+                    missing_updates_item = missing_prompt_updates[missing_next_idx]

                     mm_missing_next_idx[modality] += 1

-                    item = kwargs, updates
+                    item = missing_kwargs_item, missing_updates_item
                 else:
                     item = None

@@ -1896,7 +1902,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             mm_uuids=mm_uuids,
         )

-        mm_missing_data_items = self._get_cache_missing_items(
+        mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
             cache=cache,
             mm_data_items=mm_data_items,
             mm_hashes=mm_hashes,
@@ -1933,6 +1939,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs(
             cache,
             mm_hashes=mm_hashes,
+            mm_is_cached=mm_is_cached,
             mm_missing_kwargs=mm_missing_kwargs,
             mm_missing_prompt_updates=mm_missing_prompt_updates,
         )