mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 01:15:31 +08:00
[Misc] Do not override NCCL_CUMEM_ENABLE if set explicitly (#19105)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
parent
18093084be
commit
188a4590d8
@@ -4,17 +4,23 @@ import os
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
# set some common config/environment variables that should be set
|
# set some common config/environment variables that should be set
|
||||||
# for all processes created by vllm and all processes
|
# for all processes created by vllm and all processes
|
||||||
# that interact with vllm workers.
|
# that interact with vllm workers.
|
||||||
# they are executed whenever `import vllm` is called.
|
# they are executed whenever `import vllm` is called.
|
||||||
|
|
||||||
if not os.path.exists('/dev/nvidia-caps-imex-channels'):
|
if 'NCCL_CUMEM_ENABLE' in os.environ:
|
||||||
# normally, we disable NCCL_CUMEM_ENABLE because it
|
logger.warning(
|
||||||
# will cost 1~2 GiB GPU memory with cudagraph+allreduce,
|
"NCCL_CUMEM_ENABLE is set to %s, skipping override. "
|
||||||
# see https://github.com/NVIDIA/nccl/issues/1234
|
"This may increase memory overhead with cudagraph+allreduce: "
|
||||||
# for more details.
|
"https://github.com/NVIDIA/nccl/issues/1234",
|
||||||
# However, NCCL requires NCCL_CUMEM_ENABLE to work with
|
os.environ['NCCL_CUMEM_ENABLE'])
|
||||||
|
elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
|
||||||
|
# NCCL requires NCCL_CUMEM_ENABLE to work with
|
||||||
# multi-node NVLink, typically on GB200-NVL72 systems.
|
# multi-node NVLink, typically on GB200-NVL72 systems.
|
||||||
# The ultimate way to detect multi-node NVLink is to use
|
# The ultimate way to detect multi-node NVLink is to use
|
||||||
# NVML APIs, which are too expensive to call here.
|
# NVML APIs, which are too expensive to call here.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user