From 188a4590d852324113a80cef6d0c237d6417a9d6 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Thu, 5 Jun 2025 04:14:32 -0700 Subject: [PATCH] [Misc] Do not override NCCL_CUMEM_ENABLE if set explicitly (#19105) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/env_override.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index b0a061d2c4ed9..2bede4963f964 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -4,17 +4,23 @@ import os import torch +from vllm.logger import init_logger + +logger = init_logger(__name__) + # set some common config/environment variables that should be set # for all processes created by vllm and all processes # that interact with vllm workers. # they are executed whenever `import vllm` is called. -if not os.path.exists('/dev/nvidia-caps-imex-channels'): - # normally, we disable NCCL_CUMEM_ENABLE because it - # will cost 1~2 GiB GPU memory with cudagraph+allreduce, - # see https://github.com/NVIDIA/nccl/issues/1234 - # for more details. - # However, NCCL requires NCCL_CUMEM_ENABLE to work with +if 'NCCL_CUMEM_ENABLE' in os.environ: + logger.warning( + "NCCL_CUMEM_ENABLE is set to %s, skipping override. " + "This may increase memory overhead with cudagraph+allreduce: " + "https://github.com/NVIDIA/nccl/issues/1234", + os.environ['NCCL_CUMEM_ENABLE']) +elif not os.path.exists('/dev/nvidia-caps-imex-channels'): + # NCCL requires NCCL_CUMEM_ENABLE to work with # multi-node NVLink, typically on GB200-NVL72 systems. # The ultimate way to detect multi-node NVLink is to use # NVML APIs, which are too expensive to call here.