[NIXL] Increase default KV block eviction timeout on P (#25897)

Signed-off-by: NickLucche <nlucches@redhat.com>
2025-12-10 14:15:55 +08:00 · 2025-09-29 23:35:14 +02:00 · 2025-09-29 23:35:14 +02:00 · 2e4fe48c37
commit 2e4fe48c37
parent 8eb0a1d906
2 changed files with 3 additions and 3 deletions
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -84,7 +84,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
    - Connection info is passed via KVTransferParams from prefiller to decoder for handshake
 - `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
-    - Default: 120
+    - Default: 480
    - If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
 ## Multi-Instance Setup
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -174,7 +174,7 @@ if TYPE_CHECKING:
                                                 "NONE"] = "NONE"
    VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
    VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
-    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
+    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
    VLLM_USE_CUDNN_PREFILL: bool = False
    VLLM_ENABLE_CUDAGRAPH_GC: bool = False
    VLLM_LOOPBACK_IP: str = ""
@@ -1330,7 +1330,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # consumer. This is only applicable when using NixlConnector in a
    # disaggregated decode-prefill setup.
    "VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
-    lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),
+    lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")),
    # Controls whether or not to use cudnn prefill
    "VLLM_USE_CUDNN_PREFILL":