[Docs] Improve V1 KVConnector interface documentation (#19172)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill 2025-06-06 01:22:45 -07:00 committed by GitHub
parent 94870359cd
commit 65c69444b1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 32 additions and 9 deletions

View File

@ -8,9 +8,15 @@ The class provides the following primitives:
Scheduler-side: runs in the scheduler, binds metadata, which
is used by the worker-side to load/save KV cache.
get_num_new_matched_tokens() - get number of new tokens
that exist in the remote KV cache
that exist in the remote KV cache. Might be called multiple
times for a given request and should be side-effect free.
update_state_after_alloc() - update KVConnector state after
temporary buffer alloc by the CacheManager.
request_finished() - called when a request is finished, with
the computed kv cache blocks for the request.
Returns whether KV cache should be freed now or will be
freed asynchronously and optionally returns KV transfer
params.
Worker-side: runs in each worker, loads/saves KV cache to/from
the Connector based on the metadata.
@ -19,6 +25,9 @@ The class provides the following primitives:
save_kv_layer() - starts saving KV for layer i (maybe async)
wait_for_save() - blocks until all saves are done
get_finished() - called with ids of finished requests, returns
ids of requests that have completed async sending/recving.
"""
import enum
@ -184,7 +193,8 @@ class KVConnectorBase_V1(ABC):
finished generating tokens.
Returns:
ids of requests that have finished asynchronous transfer,
ids of requests that have finished asynchronous transfer
(requests that previously returned True from request_finished()),
tuple of (sending/saving ids, recving/loading ids).
The finished saves/sends req ids must belong to a set provided in a
call to this method (this call or a prior one).
@ -215,7 +225,8 @@ class KVConnectorBase_V1(ABC):
- The number of tokens that can be loaded from the
external KV cache beyond what is already computed.
- `True` if external KV cache tokens will be loaded
asynchronously (between scheduler steps).
asynchronously (between scheduler steps). Must be
'False' if the first element is 0.
"""
pass
@ -225,6 +236,18 @@ class KVConnectorBase_V1(ABC):
num_external_tokens: int):
"""
Update KVConnector state after block allocation.
If get_num_new_matched_tokens previously returned True for a
request, this function may be called twice for that same request -
first when blocks are allocated for the connector tokens to be
asynchronously loaded into, and second when any additional blocks
are allocated, after the load/transfer is complete.
Args:
request (Request): the request object.
blocks (KVCacheBlocks): the blocks allocated for the request.
num_external_tokens (int): the number of tokens that will be
loaded from the external KV cache.
"""
pass

View File

@ -101,7 +101,7 @@ class Scheduler(SchedulerInterface):
# This is flushed at the end of each scheduling step.
self.finished_req_ids: set[str] = set()
# P/D: requests in process of recving KV transfers
# KV Connector: requests in process of async KV loading or recving
self.finished_recving_kv_req_ids: set[str] = set()
# OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
@ -822,7 +822,7 @@ class Scheduler(SchedulerInterface):
if not stopped:
new_running.append(request)
# P/D: update state for finished KV Transfers.
# KV Connector: update state for finished KV Transfers.
self._update_from_kv_xfer_finished(model_runner_output)
# Return the cached request data to the queue so they can be reused.
@ -969,7 +969,7 @@ class Scheduler(SchedulerInterface):
self.kv_event_publisher.shutdown()
########################################################################
# P/D Related Methods
# KV Connector Related Methods
########################################################################
def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
@ -992,7 +992,7 @@ class Scheduler(SchedulerInterface):
def _update_waiting_for_remote_kv(self, request: Request) -> bool:
"""
P/D: check if the request_id is finished_recving.
KV Connector: check if the request_id is finished_recving.
The finished_recving_kv_req_ids list is populated
on the previous steps()'s update_from_output based
@ -1029,7 +1029,7 @@ class Scheduler(SchedulerInterface):
def _update_from_kv_xfer_finished(self,
model_runner_output: ModelRunnerOutput):
"""
P/D: update the scheduler state based on the output.
KV Connector: update the scheduler state based on the output.
The Worker side connectors add finished_recving and
finished_sending reqs to the output.
@ -1037,7 +1037,7 @@ class Scheduler(SchedulerInterface):
# if finished_recving: add to state so we can
scheduler the request during the next step.
"""
# P/D: update recv and send status from last step.
# KV Connector:: update recv and send status from last step.
for req_id in (model_runner_output.finished_recving or ()):
logger.debug("Finished recving KV transfer for request %s", req_id)
self.finished_recving_kv_req_ids.add(req_id)