diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 013ef3c1f5c36..c40dde26b741f 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -363,7 +363,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): num_rdma_bytes=num_rdma_bytes, low_latency_mode=True, num_qps_per_rank=num_qps_per_rank, - allow_nvlink_for_low_latency_mode=envs.VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK, + allow_nvlink_for_low_latency_mode=True, allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL, ) diff --git a/vllm/envs.py b/vllm/envs.py index 0548f01fc8cdf..2744335ed3d38 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -207,7 +207,6 @@ if TYPE_CHECKING: VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER" VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024 VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False - VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK: bool = True VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False VLLM_DBO_COMM_SMS: int = 20 VLLM_PATTERN_MATCH_DEBUG: str | None = None @@ -1400,11 +1399,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool( int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0")) ), - # Allow DeepEP to use nvlink for internode_ll kernel, turn this on for - # better latency on GB200 like system - "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK": lambda: bool( - int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "1")) - ), # Allow DeepEP to use MNNVL (multi-node nvlink) for internode_ll kernel, # turn this for better latency on GB200 like system "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool( @@ -1566,7 +1560,6 @@ def compute_hash() -> str: "VLLM_NVFP4_GEMM_BACKEND", "VLLM_USE_FBGEMM", "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", - "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", ] for key in environment_variables_to_hash: