diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index 19f6bd5726102..30c3986f2fa40 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -28,8 +28,14 @@ class BlockStored(KVCacheEvent): parent_block_hash: ExternalBlockHash | None token_ids: list[int] block_size: int + lora_id: int | None + """Deprecated: use `lora_name` for KV block key hash. + Retained for backward compatibility. + """ + medium: str | None + lora_name: str | None class BlockRemoved(KVCacheEvent): diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index c8d25f9700bf1..a0e2e5e25a47e 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -733,6 +733,7 @@ def test_kv_cache_events( ) assert event.parent_block_hash is None, "Parent block hash should be None" assert event.lora_id is None, "Lora id should be None" + assert event.lora_name is None, "Lora name should be None" assert len(event.token_ids) == num_blocks * block_size, ( "Token ids should be the same as the custom tokens" ) diff --git a/tests/v1/kv_connector/unit/test_lmcache_connector.py b/tests/v1/kv_connector/unit/test_lmcache_connector.py index 6a8cfc71a67a6..c3df2b68b1ff1 100644 --- a/tests/v1/kv_connector/unit/test_lmcache_connector.py +++ b/tests/v1/kv_connector/unit/test_lmcache_connector.py @@ -25,6 +25,7 @@ def mock_lmcache_engine_event(): lora_id, block_size, medium, + lora_name, ): self.block_hashes = block_hashes self.parent_block_hash = parent_block_hash @@ -32,6 +33,7 @@ def mock_lmcache_engine_event(): self.lora_id = lora_id self.block_size = block_size self.medium = medium + self.lora_name = lora_name return MockEvent( block_hashes=["hash1", "hash2"], @@ -40,6 +42,7 @@ def mock_lmcache_engine_event(): lora_id=None, block_size=16, medium="GPU", + lora_name=None, ) @@ -109,6 +112,7 @@ class TestGetKVConnectorKVCacheEvents: assert events[0].lora_id is None assert events[0].block_size == 16 assert events[0].medium == "GPU" + assert events[0].lora_name is None def test_converts_multiple_events(self, mock_connector): """Test conversion of multiple events from lmcache engine format.""" @@ -121,6 +125,7 @@ class TestGetKVConnectorKVCacheEvents: self.lora_id = None self.block_size = 16 self.medium = "GPU" + self.lora_name = None events = [MockEvent(i) for i in range(5)] mock_connector._lmcache_engine.get_kv_events.return_value = events @@ -150,6 +155,7 @@ class TestGetKVConnectorKVCacheEvents: self.lora_id = 42 self.block_size = 32 self.medium = "DISK" + self.lora_name = "lora_example" mock_connector._lmcache_engine.get_kv_events.return_value = [ MockEventWithLora() @@ -166,6 +172,7 @@ class TestGetKVConnectorKVCacheEvents: assert event.lora_id == 42 assert event.block_size == 32 assert event.medium == "DISK" + assert event.lora_name == "lora_example" def test_handles_none_parent_block_hash(self, mock_connector): """Test handling of events with None parent_block_hash.""" @@ -178,6 +185,7 @@ class TestGetKVConnectorKVCacheEvents: self.lora_id = None self.block_size = 16 self.medium = "GPU" + self.lora_name = None mock_connector._lmcache_engine.get_kv_events.return_value = [ MockEventNoParent() @@ -223,6 +231,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) kv_events.add_events([event]) @@ -243,6 +252,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) existing_events.add_events([event1]) existing_events.add_events([event1]) # Simulate 2 workers reporting @@ -258,6 +268,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) new_events.add_events([event2]) @@ -288,6 +299,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) new_events.add_events([event]) @@ -309,6 +321,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) events1.add_events([event1]) output1 = KVConnectorOutput(kv_cache_events=events1) @@ -323,6 +336,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) events2.add_events([event2]) output2 = KVConnectorOutput(kv_cache_events=events2) @@ -337,6 +351,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) events3.add_events([event3]) output3 = KVConnectorOutput(kv_cache_events=events3) @@ -358,6 +373,7 @@ class TestUpdateConnectorOutput: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) events1.add_events([event1]) output1 = KVConnectorOutput(kv_cache_events=events1) @@ -397,6 +413,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) event2 = BlockStored( block_hashes=["hash2"], @@ -405,6 +422,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) kv_events.add_events([event1, event2]) mock_connector._kv_cache_events = kv_events @@ -431,6 +449,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) uncommon_event = BlockStored( block_hashes=["hash_uncommon"], @@ -439,6 +458,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) # All 3 workers report common_event @@ -469,6 +489,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) kv_events1.add_events([event1]) mock_connector._kv_cache_events = kv_events1 @@ -491,6 +512,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) kv_events2.add_events([event2]) mock_connector._kv_cache_events = kv_events2 @@ -510,6 +532,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) event2 = BlockStored( block_hashes=["hash2"], @@ -518,6 +541,7 @@ class TestTakeEvents: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) # Worker 1 reports event1 @@ -572,6 +596,7 @@ class TestIntegrationScenarios: self.lora_id = None self.block_size = 16 self.medium = "GPU" + self.lora_name = None # Worker 1 mock_connector._lmcache_engine.get_kv_events.return_value = [ @@ -628,6 +653,7 @@ class TestIntegrationScenarios: self.lora_id = None self.block_size = 16 self.medium = "GPU" + self.lora_name = None for cycle in range(3): # Get events @@ -667,6 +693,7 @@ class TestIntegrationScenarios: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) worker1_unique_event = BlockStored( @@ -676,6 +703,7 @@ class TestIntegrationScenarios: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) worker2_unique_event = BlockStored( @@ -685,6 +713,7 @@ class TestIntegrationScenarios: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) worker3_unique_event = BlockStored( @@ -694,6 +723,7 @@ class TestIntegrationScenarios: block_size=16, lora_id=None, medium="GPU", + lora_name=None, ) # Create events for each worker diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py index 69565f584ab89..e1ea0b2980565 100644 --- a/tests/v1/kv_connector/unit/test_offloading_connector.py +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -528,6 +528,7 @@ def test_offloading_connector(request_runner): assert event.token_ids == [] assert event.parent_block_hash is None assert event.lora_id is None + assert event.lora_name is None event = events[1] assert isinstance(event, BlockRemoved) assert event.block_hashes == to_hashes([4, 5, 6]) diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 3b76af75504de..123af17ef0912 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -51,8 +51,14 @@ class BlockStored(KVCacheEvent): parent_block_hash: ExternalBlockHash | None token_ids: list[int] block_size: int + lora_id: int | None + """Deprecated: use `lora_name` for KV block key hash. + Retained for backward compatibility. + """ + medium: str | None + lora_name: str | None def __hash__(self) -> int: return hash( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 17d468fe6c305..6b1bdd4113626 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -218,6 +218,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1): lora_id=e.lora_id, block_size=e.block_size, medium=e.medium, + lora_name=e.lora_name, ) for e in events ] diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index a6d86bc9e1a19..99f6f9157b36d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -406,6 +406,7 @@ class OffloadingConnectorScheduler: lora_id=None, block_size=event.block_size, medium=event.medium, + lora_name=None, ) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index a6f06d1b16a34..7da0cce482b27 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -286,6 +286,9 @@ class BlockPool: if request.lora_request else None, medium=MEDIUM_GPU, + lora_name=request.lora_request.name + if request.lora_request + else None, ) )