diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index fb76276bb4b34..242690f8e1b8f 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -8,11 +8,11 @@ from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
 import torch.nn as nn
 from typing_extensions import TypeVar
 
+import vllm.platforms
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.utils import make_async
@@ -108,8 +108,8 @@ class ExecutorBase(ABC):
         """
         # NOTE: This is logged in the executor because there can be >1 workers.
         logger.info("# %s blocks: %d, # CPU blocks: %d",
-                    current_platform.dispatch_key, num_gpu_blocks,
-                    num_cpu_blocks)
+                    vllm.platforms.current_platform.dispatch_key,
+                    num_gpu_blocks, num_cpu_blocks)
         max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                            self.model_config.max_model_len)
         logger.info("Maximum concurrency for %s tokens per request: %.2fx",
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 7b30155971a6d..33c0a25803ca6 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -7,10 +7,10 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import msgspec
 
+import vllm.platforms
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -54,10 +54,10 @@ try:
 
         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            device_key = current_platform.ray_device_key
+            device_key = vllm.platforms.current_platform.ray_device_key
             if not device_key:
                 raise RuntimeError("current platform %s does not support ray.",
-                                   current_platform.device_name)
+                                   vllm.platforms.current_platform.device_name)
             gpu_ids = ray.get_runtime_context().get_accelerator_ids(
             )[device_key]
             return node_id, gpu_ids
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 991d55ac861a4..9deb0294668ec 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -334,10 +334,10 @@ class NvmlCudaPlatform(CudaPlatformBase):
         if (len(set(device_names)) > 1
                 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
             logger.warning(
-                "Detected different devices in the system: \n%s\nPlease"
+                "Detected different devices in the system: %s. Please"
                 " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                 "avoid unexpected behavior.",
-                "\n".join(device_names),
+                ", ".join(device_names),
             )
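
Note, for context: the import change relies on a basic Python semantic. `from vllm.platforms import current_platform` copies the object bound to that name into the importer's namespace once, at import time; `import vllm.platforms` keeps a reference to the module, so `vllm.platforms.current_platform` is re-resolved on every attribute access and picks up late or lazy platform detection (and test-time monkeypatching). The sketch below uses a hypothetical stand-in module named `plat`, not vLLM code, to demonstrate the difference:

```python
# Minimal sketch (hypothetical "plat" module, not vLLM code) of why
# `import pkg` + attribute access stays current while
# `from pkg import name` freezes the value at import time.
import sys
import types

# Stand-in for vllm.platforms before platform detection has run.
mod = types.ModuleType("plat")
mod.current_platform = "unresolved"
sys.modules["plat"] = mod

from plat import current_platform  # copies the value bound right now
import plat                        # binds the module object instead

# Platform detection (or a test monkeypatch) rebinds the attribute later.
mod.current_platform = "cuda"

print(current_platform)       # "unresolved" -- stale, frozen at import time
print(plat.current_platform)  # "cuda"       -- looked up at access time
```

The cuda.py hunk is independent of the import change: it joins the detected device names with commas instead of newlines so the warning renders as a single log line.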