[KV Connector] More async support for get_num_new_matched_tokens (#23620)

Signed-off-by: ApostaC <yihua98@uchicago.edu>
2025-12-22 12:25:01 +08:00 · 2025-09-09 21:23:37 -07:00 · 2025-09-09 21:23:37 -07:00 · b4a01aaf95
commit b4a01aaf95
parent 83dd28aae4
6 changed files with 23 additions and 8 deletions
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@ -243,7 +243,7 @@ class KVConnectorBase_V1(ABC):
        self,
        request: "Request",
        num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
        """
        Get number of new tokens that can be loaded from the
        external KV cache beyond the num_computed_tokens.
@ -255,8 +255,11 @@ class KVConnectorBase_V1(ABC):
        Returns:
            A tuple with the following elements:
-                - The number of tokens that can be loaded from the 
+                - An optional number of tokens that can be loaded from the 
-                  external KV cache beyond what is already computed.
+                  external KV cache beyond what is already computed. 
                  If None, it means that the connector needs more time to
                  determine the number of matched tokens, and the scheduler
                  should query for this request again later.
                - `True` if external KV cache tokens will be loaded
                  asynchronously (between scheduler steps). Must be
                  'False' if the first element is 0.
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@ -110,7 +110,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
        self,
        request: "Request",
        num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
        """
        Get number of new tokens that can be loaded from the
        external KV cache beyond the num_computed_tokens.
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@ -143,11 +143,15 @@ class MultiConnector(KVConnectorBase_V1):
        self,
        request: "Request",
        num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
        to_return = (0, False)
        for i, c in enumerate(self._connectors):
            toks, load_async = c.get_num_new_matched_tokens(
                request, num_computed_tokens)
            # If there is a connector still looking up the matches,
            # we return None to indicate that we are not done yet.
            if toks is None:
                return (None, False)
            # The first connector that has new matched tokens will be assigned
            # to this request.
            if to_return[0] == 0 and toks > 0:
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@ -162,7 +162,7 @@ class NixlConnector(KVConnectorBase_V1):
    def get_num_new_matched_tokens(
            self, request: "Request",
-            num_computed_tokens: int) -> tuple[int, bool]:
+            num_computed_tokens: int) -> tuple[Optional[int], bool]:
        assert self.connector_scheduler is not None
        return self.connector_scheduler.get_num_new_matched_tokens(
            request, num_computed_tokens)
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@ -3,7 +3,7 @@
 import hashlib
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 import safetensors
 import torch
@ -238,7 +238,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
        self,
        request: "Request",
        num_computed_tokens: int,
-    ) -> tuple[int, bool]:
+    ) -> tuple[Optional[int], bool]:
        """
        Get number of new tokens that can be loaded from the
        external KV cache beyond the num_computed_tokens.
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@ -387,6 +387,14 @@ class Scheduler(SchedulerInterface):
                            self.connector.get_num_new_matched_tokens(
                                request, num_new_local_computed_tokens))
                        if num_external_computed_tokens is None:
                            # The request cannot be scheduled because
                            # the KVConnector couldn't determine
                            # the number of matched tokens.
                            self.waiting.pop_request()
                            skipped_waiting_requests.prepend_request(request)
                            continue
                    # Total computed tokens (local + external).
                    num_computed_tokens = (num_new_local_computed_tokens +
                                           num_external_computed_tokens)