mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 14:15:55 +08:00
[NIXL] Increase default KV block eviction timeout on P (#25897)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
parent
8eb0a1d906
commit
2e4fe48c37
@ -84,7 +84,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \
|
|||||||
- Connection info is passed via KVTransferParams from prefiller to decoder for handshake
|
- Connection info is passed via KVTransferParams from prefiller to decoder for handshake
|
||||||
|
|
||||||
- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
- `VLLM_NIXL_ABORT_REQUEST_TIMEOUT`: Timeout (in seconds) for automatically releasing the prefiller’s KV cache for a particular request. (Optional)
|
||||||
- Default: 120
|
- Default: 480
|
||||||
- If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
|
- If a request is aborted and the decoder has not yet read the KV-cache blocks through the nixl channel, the prefill instance will release its KV-cache blocks after this timeout to avoid holding them indefinitely.
|
||||||
|
|
||||||
## Multi-Instance Setup
|
## Multi-Instance Setup
|
||||||
|
|||||||
@ -174,7 +174,7 @@ if TYPE_CHECKING:
|
|||||||
"NONE"] = "NONE"
|
"NONE"] = "NONE"
|
||||||
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
|
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
|
||||||
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
|
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
|
||||||
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
|
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
|
||||||
VLLM_USE_CUDNN_PREFILL: bool = False
|
VLLM_USE_CUDNN_PREFILL: bool = False
|
||||||
VLLM_ENABLE_CUDAGRAPH_GC: bool = False
|
VLLM_ENABLE_CUDAGRAPH_GC: bool = False
|
||||||
VLLM_LOOPBACK_IP: str = ""
|
VLLM_LOOPBACK_IP: str = ""
|
||||||
@ -1330,7 +1330,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# consumer. This is only applicable when using NixlConnector in a
|
# consumer. This is only applicable when using NixlConnector in a
|
||||||
# disaggregated decode-prefill setup.
|
# disaggregated decode-prefill setup.
|
||||||
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
|
"VLLM_NIXL_ABORT_REQUEST_TIMEOUT":
|
||||||
lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "120")),
|
lambda: int(os.getenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", "480")),
|
||||||
|
|
||||||
# Controls whether or not to use cudnn prefill
|
# Controls whether or not to use cudnn prefill
|
||||||
"VLLM_USE_CUDNN_PREFILL":
|
"VLLM_USE_CUDNN_PREFILL":
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user