[Core] Use platform-agnostic device control for DP engine core (#17245)
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
commit 289199feb6 (parent b9fd0d7a69)
@@ -34,24 +34,6 @@ pynvml = import_pynvml()
 torch.backends.cuda.enable_cudnn_sdp(False)
 
 
-def device_id_to_physical_device_id(device_id: int) -> int:
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-        if device_ids == [""]:
-            msg = (
-                "CUDA_VISIBLE_DEVICES is set to empty string, which means"
-                " GPU support is disabled. If you are using ray, please unset"
-                " the environment variable `CUDA_VISIBLE_DEVICES` inside the"
-                " worker/actor. "
-                "Check https://github.com/vllm-project/vllm/issues/8402 for"
-                " more information.")
-            raise RuntimeError(msg)
-        physical_device_id = device_ids[device_id]
-        return int(physical_device_id)
-    else:
-        return device_id
-
-
 def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
 
     @wraps(fn)
@@ -338,7 +320,7 @@ class NvmlCudaPlatform(CudaPlatformBase):
                               device_id: int = 0
                               ) -> Optional[DeviceCapability]:
         try:
-            physical_device_id = device_id_to_physical_device_id(device_id)
+            physical_device_id = cls.device_id_to_physical_device_id(device_id)
             handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
             major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
             return DeviceCapability(major=major, minor=minor)
@@ -360,20 +342,20 @@ class NvmlCudaPlatform(CudaPlatformBase):
     @classmethod
     @with_nvml_context
     def get_device_name(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = cls.device_id_to_physical_device_id(device_id)
         return cls._get_physical_device_name(physical_device_id)
 
     @classmethod
     @with_nvml_context
     def get_device_uuid(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = cls.device_id_to_physical_device_id(device_id)
         handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
         return pynvml.nvmlDeviceGetUUID(handle)
 
     @classmethod
     @with_nvml_context
     def get_device_total_memory(cls, device_id: int = 0) -> int:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = cls.device_id_to_physical_device_id(device_id)
         handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
         return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total)
 
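The NVML-backed accessors above now resolve a logical device index through the classmethod inherited from the Platform base class instead of the removed module-level helper. Below is a standalone sketch (not vLLM code) of the equivalent lookup for a device name, assuming pynvml is installed; the function name lookup_device_name is illustrative.

# Standalone sketch: map a logical index through CUDA_VISIBLE_DEVICES, then
# query the physical device's name via NVML, mirroring the flow of
# NvmlCudaPlatform.get_device_name after this change.
import os

import pynvml


def lookup_device_name(logical_id: int = 0) -> str:
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    physical_id = int(visible.split(",")[logical_id]) if visible else logical_id
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(physical_id)
        return pynvml.nvmlDeviceGetName(handle)
    finally:
        pynvml.nvmlShutdown()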
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import enum
+import os
 import platform
 import random
 from platform import uname
@@ -161,6 +162,24 @@ class Platform:
     def is_sleep_mode_available(self) -> bool:
         return self._enum == PlatformEnum.CUDA
 
+    @classmethod
+    def device_id_to_physical_device_id(cls, device_id: int):
+        if cls.device_control_env_var in os.environ:
+            device_ids = os.environ[cls.device_control_env_var].split(",")
+            if device_ids == [""]:
+                msg = (f"{cls.device_control_env_var} is set to empty string, "
+                       "which means current platform support is disabled. If "
+                       "you are using ray, please unset the environment "
+                       f"variable `{cls.device_control_env_var}` inside the "
+                       "worker/actor. Check "
+                       "https://github.com/vllm-project/vllm/issues/8402 for "
+                       "more information.")
+                raise RuntimeError(msg)
+            physical_device_id = device_ids[device_id]
+            return int(physical_device_id)
+        else:
+            return device_id
+
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                              dtype: torch.dtype, kv_cache_dtype: Optional[str],
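A minimal standalone illustration of the mapping the new base-class helper performs, assuming a platform whose device_control_env_var is CUDA_VISIBLE_DEVICES (the MyPlatform class here is hypothetical, not part of vLLM):

import os


class MyPlatform:
    # Hypothetical subclass; CUDA_VISIBLE_DEVICES is used as an example value.
    device_control_env_var = "CUDA_VISIBLE_DEVICES"

    @classmethod
    def device_id_to_physical_device_id(cls, device_id: int) -> int:
        if cls.device_control_env_var in os.environ:
            device_ids = os.environ[cls.device_control_env_var].split(",")
            if device_ids == [""]:
                raise RuntimeError(
                    f"{cls.device_control_env_var} is set to an empty string")
            return int(device_ids[device_id])
        return device_id


# Logical index 1 resolves to physical device 5 under this masking.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
assert MyPlatform.device_id_to_physical_device_id(1) == 5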
@@ -95,15 +95,6 @@ def with_amdsmi_context(fn):
     return wrapper
 
 
-def device_id_to_physical_device_id(device_id: int) -> int:
-    if "CUDA_VISIBLE_DEVICES" in os.environ:
-        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
-        physical_device_id = device_ids[device_id]
-        return int(physical_device_id)
-    else:
-        return device_id
-
-
 @cache
 def on_mi250_mi300() -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
@@ -238,7 +229,7 @@ class RocmPlatform(Platform):
     @with_amdsmi_context
     @lru_cache(maxsize=8)
     def get_device_name(cls, device_id: int = 0) -> str:
-        physical_device_id = device_id_to_physical_device_id(device_id)
+        physical_device_id = cls.device_id_to_physical_device_id(device_id)
         handle = amdsmi_get_processor_handles()[physical_device_id]
         asic_info = amdsmi_get_gpu_asic_info(handle)
         device_name: str = asic_info["device_id"]
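The ROCm path follows the same pattern. A rough standalone sketch of the equivalent name lookup, assuming the amdsmi Python bindings are available; resolving the logical index through CUDA_VISIBLE_DEVICES mirrors the removed ROCm helper above, while the real code now defers to whatever env var RocmPlatform declares as device_control_env_var.

import os

from amdsmi import (amdsmi_get_gpu_asic_info, amdsmi_get_processor_handles,
                    amdsmi_init, amdsmi_shut_down)


def rocm_device_name(logical_id: int = 0) -> str:
    # Same logical -> physical mapping as the removed helper above.
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    physical_id = int(visible.split(",")[logical_id]) if visible else logical_id
    amdsmi_init()
    try:
        handle = amdsmi_get_processor_handles()[physical_id]
        return amdsmi_get_gpu_asic_info(handle)["device_id"]
    finally:
        amdsmi_shut_down()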
@@ -622,13 +622,12 @@ class DPEngineCoreProc(EngineCoreProc):
         assert 0 <= local_dp_rank <= dp_rank < dp_size
 
         from vllm.platforms import current_platform
-        if current_platform.is_cuda_alike():
-            from vllm.platforms.cuda import device_id_to_physical_device_id
-            tp_size = vllm_config.parallel_config.tensor_parallel_size
-            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
-                str(device_id_to_physical_device_id(i))
-                for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
-                               tp_size))
+        device_control_env_var = current_platform.device_control_env_var
+        tp_size = vllm_config.parallel_config.tensor_parallel_size
+        os.environ[device_control_env_var] = ",".join(
+            str(current_platform.device_id_to_physical_device_id(i))
+            for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
+                           tp_size))
 
         self.local_dp_rank = local_dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
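To make the device-selection arithmetic above concrete, here is a small standalone illustration (select_devices and id_map are illustrative names, not vLLM APIs): each local data-parallel rank claims a contiguous block of tp_size logical devices and exports their physical IDs through the platform's device-control env var.

import os


def select_devices(local_dp_rank: int, tp_size: int,
                   id_map=lambda i: i) -> str:
    # id_map stands in for current_platform.device_id_to_physical_device_id.
    return ",".join(
        str(id_map(i))
        for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) * tp_size))


# With tp_size=2: DP rank 0 claims devices "0,1", DP rank 1 claims "2,3".
assert select_devices(0, 2) == "0,1"
assert select_devices(1, 2) == "2,3"
# e.g. os.environ["CUDA_VISIBLE_DEVICES"] = select_devices(1, 2)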