From 2e4fe48c370e833350eae092eddd1490b65ff529 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Mon, 29 Sep 2025 23:35:14 +0200
Subject: [PATCH] [NIXL] Increase default KV block eviction timeout on P
 (#25897)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 docs/features/nixl_connector_usage.md | 2 +-
 vllm/envs.py                          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md
index afecbc82947b..5e273af05dc5 100644
--- a/docs/features/nixl_connector_usage.md
+++ b/docs/features/nixl_connector_usage.md
@@ -84,7 +84,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
     - Connection info is passed via KVTransferParams from prefiller to decoder for handshake
 
 - `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
-    - Default: 120
+    - Default: 480
     - If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
 
 ## Multi-Instance Setup
diff --git a/vllm/envs.py b/vllm/envs.py
index f06c860b8297..ffa7ed5c3aa5 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -174,7 +174,7 @@ if TYPE_CHECKING:
                                                  "NONE"] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
-    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
+    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
@@ -1330,7 +1330,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # consumer. This is only applicable when using NixlConnector in a
     # disaggregated decode-prefill setup.
     "VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
-    lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),
+    lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")),
 
     # Controls whether or not to use cudnn prefill
     "VLLM_USE_CUDNN_PREFILL":