diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 837a513cb75e..2291f363731f 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -9,7 +9,8 @@ import pytest import torch import vllm.v1.core.kv_cache_utils as kv_cache_utils -from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved +from vllm.distributed.kv_events import AllBlocksCleared, BlockRemoved, BlockStored +from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import ( MultiModalFeatureSpec, MultiModalKwargsItem, @@ -59,6 +60,7 @@ def make_request( mm_hashes: list[str] | None = None, prompt_logprobs: int | None = None, cache_salt: str | None = None, + lora_request: LoRARequest | None = None, ): mm_features = [] if mm_positions is not None: @@ -79,7 +81,7 @@ def make_request( sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), pooling_params=None, eos_token_id=100, - lora_request=None, + lora_request=lora_request, cache_salt=cache_salt, block_hasher=get_request_block_hasher(block_size, hash_fn), ) @@ -1337,6 +1339,63 @@ def test_kv_cache_events(blocks_to_cache: int): assert len(manager.block_pool.cached_block_hash_to_block) == 0 +@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10]) +def test_kv_cache_events_with_lora(blocks_to_cache: int): + """Test BlockStored events contain correct lora_id when using LoRA requests.""" + block_size = 16 + num_blocks = blocks_to_cache + 1 + + # Create KVCacheManager with events enabled + manager = KVCacheManager( + make_kv_cache_config(block_size, num_blocks), + max_model_len=8192, + enable_caching=True, + enable_kv_cache_events=True, + ) + + # Test with LoRA request + lora_request = LoRARequest( + lora_name="test_lora", lora_int_id=42, lora_path="/test/path" + ) + + num_tokens = block_size * blocks_to_cache + req_with_lora = make_request( + "lora_req", + list(range(num_tokens)), + block_size, + sha256, + lora_request=lora_request, + ) + + # Allocate slots and get events + _ = manager.allocate_slots(req_with_lora, num_tokens) + events = manager.take_events() + + # Verify BlockStored event contains correct lora_id + block_stored_event = events[-1] + assert isinstance(block_stored_event, BlockStored) + assert block_stored_event.lora_id == 42 # Should match lora_request.adapter_id + assert len(block_stored_event.block_hashes) == blocks_to_cache + assert block_stored_event.block_size == block_size + + # Clean up + manager.free(req_with_lora) + + # Test without LoRA request (should have lora_id=None) + req_without_lora = make_request( + "no_lora_req", list(range(num_tokens)), block_size, sha256 + ) + + _ = manager.allocate_slots(req_without_lora, num_tokens) + events = manager.take_events() + + block_stored_event = events[-1] + assert isinstance(block_stored_event, BlockStored) + assert block_stored_event.lora_id is None # Should be None when no LoRA request + assert len(block_stored_event.block_hashes) == blocks_to_cache + assert block_stored_event.block_size == block_size + + def test_eagle_enabled_removes_last_block(): """Verify Eagle does NOT remove blocks when request length is divisible by block size.""" diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 15c06a0b107d..55710ad5cc69 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -259,7 +259,9 @@ class BlockPool: num_cached_blocks * block_size : num_full_blocks * block_size ], block_size=block_size, - lora_id=request.lora_request.id if request.lora_request else None, + lora_id=request.lora_request.adapter_id + if request.lora_request + else None, medium=MEDIUM_GPU, ) )