diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py
index dee92976eb6f6..2690346af4d30 100644
--- a/tests/kernels/core/test_uva.py
+++ b/tests/kernels/core/test_uva.py
@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from vllm.utils import is_uva_available
+from vllm.utils.platform_utils import is_uva_available
 from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
 
 CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py
index 9682a7c0c8b35..dac7ffed69d4a 100644
--- a/tests/v1/logits_processors/test_correctness.py
+++ b/tests/v1/logits_processors/test_correctness.py
@@ -21,7 +21,7 @@ from tests.v1.sample.utils import (
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.sample.logits_processor import (
     BatchUpdate,
     BatchUpdateBuilder,
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
index a1513acc7b8e3..51f2bf5e753c0 100644
--- a/tests/v1/sample/test_sampler.py
+++ b/tests/v1/sample/test_sampler.py
@@ -7,7 +7,7 @@ import torch
 
 from tests.v1.sample.utils import create_allowed_token_ids
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 132f0a58bbf5e..6ea65c6944b05 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -10,7 +10,7 @@ import torch
 
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 2586927864ab9..5e3dbde393be3 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -18,7 +18,7 @@ from typing import Any
 
 import torch
 
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)
diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py
index 4a8b35aeb5b84..7691481d5039e 100644
--- a/vllm/lora/lora_weights.py
+++ b/vllm/lora/lora_weights.py
@@ -8,7 +8,7 @@ import torch
 import torch.types
 
 from vllm.lora.peft_helper import PEFTHelper
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 
 class LoRALayerWeights:
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index fb2b9bdceddf7..02c252f15bfab 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -31,8 +31,8 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
-from vllm.utils import is_pin_memory_available
 from vllm.utils.cache import LRUCache
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 88dfbc33e10bf..ba708a098c0da 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (
     try_create_mm_pooling_model_cls,
 )
 from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 4f3c7fdb69365..1b3ce3edd47bc 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -72,7 +72,7 @@ from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 022cd0fd2300d..e4f540dea8f71 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -22,8 +22,8 @@ from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import NestedTensors
 from vllm.sequence import IntermediateTensors
-from vllm.utils import (
-    cdiv,
+from vllm.utils import cdiv
+from vllm.utils.platform_utils import (
     is_pin_memory_available,
     is_uva_available,
 )
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 4211535131a42..a9f56d5d8f7ac 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -21,7 +21,7 @@ import torch
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
-from vllm.utils import cuda_get_device_properties
+from vllm.utils.platform_utils import cuda_get_device_properties
 from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.version import __version__ as VLLM_VERSION
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index a630279e9e228..38da04102c44b 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -27,12 +27,8 @@ from argparse import (
     _ArgumentGroup,
 )
 from collections import defaultdict
-from collections.abc import (
-    Callable,
-    Sequence,
-)
-from concurrent.futures.process import ProcessPoolExecutor
-from functools import cache, partial, wraps
+from collections.abc import Callable
+from functools import partial, wraps
 from typing import TYPE_CHECKING, Any, TypeVar
 
 import cloudpickle
@@ -44,6 +40,7 @@ import yaml
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.ray.lazy_utils import is_in_ray_actor
+from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized
 
 _DEPRECATED_MAPPINGS = {
     "cprofile": "profiling",
@@ -169,21 +166,6 @@ def round_down(x: int, y: int) -> int:
     return (x // y) * y
 
 
-@cache
-def is_pin_memory_available() -> bool:
-    from vllm.platforms import current_platform
-
-    return current_platform.is_pin_memory_available()
-
-
-@cache
-def is_uva_available() -> bool:
-    """Check if Unified Virtual Addressing (UVA) is available."""
-    # UVA requires pinned memory.
-    # TODO: Add more requirements for UVA if needed.
-    return is_pin_memory_available()
-
-
 # TODO: This function can be removed if transformer_modules classes are
 # serialized by value when communicating between processes
 def init_cached_hf_modules() -> None:
@@ -216,35 +198,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
         enable_trace_function_call(log_path)
 
 
-def cuda_is_initialized() -> bool:
-    """Check if CUDA is initialized."""
-    if not torch.cuda._is_compiled():
-        return False
-    return torch.cuda.is_initialized()
-
-
-def xpu_is_initialized() -> bool:
-    """Check if XPU is initialized."""
-    if not torch.xpu._is_compiled():
-        return False
-    return torch.xpu.is_initialized()
-
-
-def cuda_get_device_properties(
-    device, names: Sequence[str], init_cuda=False
-) -> tuple[Any, ...]:
-    """Get specified CUDA device property values without initializing CUDA in
-    the current process."""
-    if init_cuda or cuda_is_initialized():
-        props = torch.cuda.get_device_properties(device)
-        return tuple(getattr(props, name) for name in names)
-
-    # Run in subprocess to avoid initializing CUDA as a side effect.
-    mp_ctx = multiprocessing.get_context("fork")
-    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
-        return executor.submit(cuda_get_device_properties, device, names, True).result()
-
-
 def weak_bind(
     bound_method: Callable[..., Any],
 ) -> Callable[..., None]:
diff --git a/vllm/utils/platform_utils.py b/vllm/utils/platform_utils.py
new file mode 100644
index 0000000000000..34ac820c6e9d6
--- /dev/null
+++ b/vllm/utils/platform_utils.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+from collections.abc import Sequence
+from concurrent.futures.process import ProcessPoolExecutor
+from functools import cache
+from typing import Any
+
+import torch
+
+
+def cuda_is_initialized() -> bool:
+    """Check if CUDA is initialized."""
+    if not torch.cuda._is_compiled():
+        return False
+    return torch.cuda.is_initialized()
+
+
+def xpu_is_initialized() -> bool:
+    """Check if XPU is initialized."""
+    if not torch.xpu._is_compiled():
+        return False
+    return torch.xpu.is_initialized()
+
+
+def cuda_get_device_properties(
+    device, names: Sequence[str], init_cuda=False
+) -> tuple[Any, ...]:
+    """Get specified CUDA device property values without initializing CUDA in
+    the current process."""
+    if init_cuda or cuda_is_initialized():
+        props = torch.cuda.get_device_properties(device)
+        return tuple(getattr(props, name) for name in names)
+
+    # Run in subprocess to avoid initializing CUDA as a side effect.
+    mp_ctx = multiprocessing.get_context("fork")
+    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
+        return executor.submit(cuda_get_device_properties, device, names, True).result()
+
+
+@cache
+def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
+
+    return current_platform.is_pin_memory_available()
+
+
+@cache
+def is_uva_available() -> bool:
+    """Check if Unified Virtual Addressing (UVA) is available."""
+    # UVA requires pinned memory.
+    # TODO: Add more requirements for UVA if needed.
+    return is_pin_memory_available()
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index cd54b964c41ff..78fd7b3bcf736 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -34,12 +34,13 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv, is_pin_memory_available
+from vllm.utils import cdiv
 from vllm.utils.flashinfer import (
     can_use_trtllm_attention,
     flashinfer_disable_q_quantization,
     use_trtllm_attention,
 )
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index eb7117a400b90..646f9d0d75423 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -7,7 +7,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.attention import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index 2fb320dd2aaf8..9883ab8fb9964 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -5,7 +5,7 @@ from dataclasses import dataclass
 
 import torch
 
 from vllm.pooling_params import PoolingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 pin_memory = is_pin_memory_available()
diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py
index 44f53d95dd3b7..898b90d41abae 100644
--- a/vllm/v1/sample/ops/penalties.py
+++ b/vllm/v1/sample/ops/penalties.py
@@ -4,7 +4,7 @@
 import torch
 
 from vllm.model_executor.layers.utils import apply_penalties
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 
 
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 0cf1b4f89f31e..39c63fe31ad2c 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -6,7 +6,7 @@ import torch
 import torch.nn as nn
 
 from vllm.config.model import LogprobsMode
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 206b13ad51649..635fcb2f8fb19 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -24,7 +24,7 @@ from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.tree_attn import (
     TreeAttentionMetadata,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ebc8cfe92deb2..1f02ff1ead0ca 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -72,13 +72,13 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (
     cdiv,
     check_use_alibi,
-    is_pin_memory_available,
     length_from_prompt_token_ids_or_embeds,
     round_up,
 )
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
     get_dtype_size,
     kv_cache_dtype_str_to_dtype,
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 2107df5fc1032..18b857a641366 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -53,7 +53,8 @@ from vllm.multimodal.inputs import (
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available, prev_power_of_2
+from vllm.utils import LayerBlockType, cdiv, prev_power_of_2
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.pallas import (
     TPU_STR_DTYPE_TO_TORCH_DTYPE,
     PallasAttentionBackend,
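
Not part of the patch itself: a minimal usage sketch of the relocated helpers, assuming a vLLM checkout that includes the vllm/utils/platform_utils.py module added above. It only calls functions defined verbatim in that module; the device index and property names below are illustrative.

from vllm.utils.platform_utils import (
    cuda_get_device_properties,
    is_pin_memory_available,
    is_uva_available,
)

# Cached platform checks (delegating to current_platform under the hood).
if is_pin_memory_available():
    print("pin memory available; UVA available:", is_uva_available())

# Reads CUDA device properties; when CUDA is not yet initialized in this
# process, the helper forks a subprocess so CUDA stays uninitialized here.
name, total_memory = cuda_get_device_properties(0, ("name", "total_memory"))
print(f"{name}: {total_memory} bytes")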