[distributed][misc] add specialized method for cuda platform (#7249)

youkaichao 2024-08-07 08:54:52 -07:00 committed by GitHub
parent 66d617e343
commit 639159b2a6
3 changed files with 42 additions and 53 deletions

vllm/distributed/device_communicators/custom_all_reduce.py

@@ -11,7 +11,8 @@ from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
-from vllm.utils import cuda_device_count_stateless, is_full_nvlink
+from vllm.platforms import current_platform
+from vllm.utils import cuda_device_count_stateless
 
 try:
     assert ops.is_custom_op_supported("_C_custom_ar::meta_size")
@@ -113,7 +114,10 @@ class CustomAllreduce:
         # test nvlink first, this will filter out most of the cases
         # where custom allreduce is not supported
         # this checks hardware and driver support for NVLink
-        full_nvlink = is_full_nvlink(physical_device_ids)
+        assert current_platform.is_cuda()
+        from vllm.platforms.cuda import CudaPlatform
+        cuda_platform: CudaPlatform = current_platform
+        full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
         if world_size > 2 and not full_nvlink:
             logger.warning(
                 "Custom allreduce is disabled because it's not supported on"

vllm/platforms/cuda.py

@@ -4,12 +4,21 @@ pynvml. However, it should not initialize cuda context.
 
 import os
 from functools import lru_cache, wraps
-from typing import Tuple
+from typing import List, Tuple
 
 import pynvml
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum
 
+logger = init_logger(__name__)
+
+# NVML utils
+# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
+# all the related functions work on real physical device ids.
+# the major benefit of using NVML is that it will not initialize CUDA
+
 
 def with_nvml_context(fn):
 
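Only the `def with_nvml_context(fn):` line appears as context here; the decorator body is not touched by this commit. For reference, a sketch of what the decorator does, inferred from the near-identical copy being deleted from `vllm/utils.py` below (minus the `pynvml is None` guards, since this module imports pynvml unconditionally):

```python
from functools import wraps

import pynvml  # safe to import unconditionally on the CUDA platform


def with_nvml_context(fn):
    """Run the wrapped function inside an NVML init/shutdown pair so
    decorated functions can issue NVML queries without managing the
    library lifecycle themselves."""

    @wraps(fn)
    def wrapper(*args, **kwargs):
        pynvml.nvmlInit()
        try:
            return fn(*args, **kwargs)
        finally:
            pynvml.nvmlShutdown()

    return wrapper
```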
@@ -47,3 +56,29 @@ class CudaPlatform(Platform):
     def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         physical_device_id = device_id_to_physical_device_id(device_id)
         return get_physical_device_capability(physical_device_id)
+
+    @staticmethod
+    @with_nvml_context
+    def is_full_nvlink(physical_device_ids: List[int]) -> bool:
+        """
+        query if the set of gpus are fully connected by nvlink (1 hop)
+        """
+        handles = [
+            pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids
+        ]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                            handle, peer_handle,
+                            pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                            return False
+                    except pynvml.NVMLError as error:
+                        logger.error(
+                            "NVLink detection failed. This is normal if your"
+                            " machine has no NVLink equipped.",
+                            exc_info=error)
+                        return False
+        return True
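The method checks every unordered pair of devices and requires the NVLink P2P status to be OK for all of them. To sanity-check the same NVML query outside vLLM, a small standalone script works (hypothetical example: assumes the `pynvml` package is installed and at least two physical GPUs are visible):

```python
import pynvml

pynvml.nvmlInit()
try:
    # NVML indices are physical device ids, unaffected by
    # CUDA_VISIBLE_DEVICES.
    h0 = pynvml.nvmlDeviceGetHandleByIndex(0)
    h1 = pynvml.nvmlDeviceGetHandleByIndex(1)
    status = pynvml.nvmlDeviceGetP2PStatus(
        h0, h1, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
    print("NVLink P2P 0<->1:",
          "OK" if status == pynvml.NVML_P2P_STATUS_OK else "unavailable")
finally:
    pynvml.nvmlShutdown()
```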

vllm/utils.py

@@ -1034,56 +1034,6 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
-
-# NVML utils
-# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
-# all the related functions work on real physical device ids.
-# the major benefit of using NVML is that it will not initialize CUDA
-
-try:
-    import pynvml
-except ImportError:
-    # For non-NV devices
-    pynvml = None
-
-
-def with_nvml_context(fn):
-
-    @wraps(fn)
-    def wrapper(*args, **kwargs):
-        if pynvml is not None:
-            pynvml.nvmlInit()
-        try:
-            return fn(*args, **kwargs)
-        finally:
-            if pynvml is not None:
-                pynvml.nvmlShutdown()
-
-    return wrapper
-
-
-@with_nvml_context
-def is_full_nvlink(device_ids: List[int]) -> bool:
-    """
-    query if the set of gpus are fully connected by nvlink (1 hop)
-    """
-    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
-    for i, handle in enumerate(handles):
-        for j, peer_handle in enumerate(handles):
-            if i < j:
-                try:
-                    p2p_status = pynvml.nvmlDeviceGetP2PStatus(
-                        handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
-                    if p2p_status != pynvml.NVML_P2P_STATUS_OK:
-                        return False
-                except pynvml.NVMLError as error:
-                    logger.error(
-                        "NVLink detection failed. This is normal if your"
-                        " machine has no NVLink equipped.",
-                        exc_info=error)
-                    return False
-    return True
-
 
 # From: https://stackoverflow.com/a/4104188/2749989
 def run_once(f):