[Bugfix][ROCm] Fix attempt to import non-existent symbols from libnccl.so (#25605)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
This commit is contained in:
Gregory Shtrasberg 2025-09-29 17:01:50 -04:00 committed by GitHub
parent 9bedac9623
commit 61a3431613
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -30,7 +30,9 @@ from typing import Any, Optional
import torch
from torch.distributed import ReduceOp
from vllm import envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import find_nccl_library
logger = init_logger(__name__)
@ -275,10 +277,27 @@ class NCCLLibrary:
if so_file not in NCCLLibrary.path_to_dict_mapping:
_funcs: dict[str, Any] = {}
for func in NCCLLibrary.exported_functions:
f = getattr(self.lib, func.name)
f.restype = func.restype
f.argtypes = func.argtypes
_funcs[func.name] = f
try:
f = getattr(self.lib, func.name)
f.restype = func.restype
f.argtypes = func.argtypes
_funcs[func.name] = f
except AttributeError:
if func.name in [
"ncclCommWindowRegister",
"ncclCommWindowDeregister"
]:
if envs.VLLM_USE_NCCL_SYMM_MEM:
logger.warning_once(
"The symbol %s is not found in the NCCL "
"library %s. To enable VLLM_USE_NCCL_SYMM_MEM "
" please update your NCCL version to >= "
"2.27.03.", func.name, so_file)
if current_platform.is_rocm():
# Having an exception here on ROCm platform is
# not allowed during graph capturing
continue
raise
NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]