[Chore] Separate out vllm.utils.platform_utils.py (#27374)

Signed-off-by: Jonathan <chenleejonathan@gmail.com>
This commit is contained in:
Jonathan Chen 2025-10-23 15:08:06 -04:00 committed by GitHub
parent a9f55dc588
commit ca76486a16
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 79 additions and 70 deletions

View File

@ -3,7 +3,7 @@
import pytest import pytest
import torch import torch
from vllm.utils import is_uva_available from vllm.utils.platform_utils import is_uva_available
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]

View File

@ -21,7 +21,7 @@ from tests.v1.sample.utils import (
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.sample.logits_processor import ( from vllm.v1.sample.logits_processor import (
BatchUpdate, BatchUpdate,
BatchUpdateBuilder, BatchUpdateBuilder,

View File

@ -7,7 +7,7 @@ import torch
from tests.v1.sample.utils import create_allowed_token_ids from tests.v1.sample.utils import create_allowed_token_ids
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.logits_processor import LogitsProcessors
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata

View File

@ -10,7 +10,7 @@ import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.pool.metadata import PoolingMetadata
from vllm.v1.sample.logits_processor import LogitsProcessors from vllm.v1.sample.logits_processor import LogitsProcessors

View File

@ -18,7 +18,7 @@ from typing import Any
import torch import torch
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)

View File

@ -8,7 +8,7 @@ import torch
import torch.types import torch.types
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
class LoRALayerWeights: class LoRALayerWeights:

View File

@ -31,8 +31,8 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.interfaces import is_pooling_model
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
from vllm.utils import is_pin_memory_available
from vllm.utils.cache import LRUCache from vllm.utils.cache import LRUCache
from vllm.utils.platform_utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)

View File

@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (
try_create_mm_pooling_model_cls, try_create_mm_pooling_model_cls,
) )
from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
logger = init_logger(__name__) logger = init_logger(__name__)

View File

@ -72,7 +72,7 @@ from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.multimodal.processing import PromptReplacement, PromptUpdate
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import ( from .interfaces import (

View File

@ -22,8 +22,8 @@ from vllm.logger import init_logger
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import NestedTensors from vllm.multimodal import NestedTensors
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import ( from vllm.utils import cdiv
cdiv, from vllm.utils.platform_utils import (
is_pin_memory_available, is_pin_memory_available,
is_uva_available, is_uva_available,
) )

View File

@ -21,7 +21,7 @@ import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import cuda_get_device_properties from vllm.utils.platform_utils import cuda_get_device_properties
from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.version import __version__ as VLLM_VERSION from vllm.version import __version__ as VLLM_VERSION

View File

@ -27,12 +27,8 @@ from argparse import (
_ArgumentGroup, _ArgumentGroup,
) )
from collections import defaultdict from collections import defaultdict
from collections.abc import ( from collections.abc import Callable
Callable, from functools import partial, wraps
Sequence,
)
from concurrent.futures.process import ProcessPoolExecutor
from functools import cache, partial, wraps
from typing import TYPE_CHECKING, Any, TypeVar from typing import TYPE_CHECKING, Any, TypeVar
import cloudpickle import cloudpickle
@ -44,6 +40,7 @@ import yaml
import vllm.envs as envs import vllm.envs as envs
from vllm.logger import enable_trace_function_call, init_logger from vllm.logger import enable_trace_function_call, init_logger
from vllm.ray.lazy_utils import is_in_ray_actor from vllm.ray.lazy_utils import is_in_ray_actor
from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized
_DEPRECATED_MAPPINGS = { _DEPRECATED_MAPPINGS = {
"cprofile": "profiling", "cprofile": "profiling",
@ -169,21 +166,6 @@ def round_down(x: int, y: int) -> int:
return (x // y) * y return (x // y) * y
@cache
def is_pin_memory_available() -> bool:
    """Return whether the current platform reports pinned memory as available.

    The answer is delegated to ``current_platform.is_pin_memory_available()``
    and memoized by ``functools.cache``, so the platform is queried only on
    the first call.
    """
    # Imported inside the function rather than at module level
    # (presumably to avoid an import cycle with vllm.platforms -- confirm).
    from vllm.platforms import current_platform as platform

    pin_memory_ok = platform.is_pin_memory_available()
    return pin_memory_ok
@cache
def is_uva_available() -> bool:
    """Report whether Unified Virtual Addressing (UVA) can be used.

    Currently this is exactly the pinned-memory check; the result is
    memoized by ``functools.cache``.
    """
    # UVA requires pinned memory.
    # TODO: Add more requirements for UVA if needed.
    uva_ok = is_pin_memory_available()
    return uva_ok
# TODO: This function can be removed if transformer_modules classes are # TODO: This function can be removed if transformer_modules classes are
# serialized by value when communicating between processes # serialized by value when communicating between processes
def init_cached_hf_modules() -> None: def init_cached_hf_modules() -> None:
@ -216,35 +198,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
enable_trace_function_call(log_path) enable_trace_function_call(log_path)
def cuda_is_initialized() -> bool:
    """Return True only if torch was built with CUDA and CUDA is initialized."""
    # Short-circuit: is_initialized() is consulted only when torch reports
    # that it was compiled with CUDA support.
    return torch.cuda._is_compiled() and torch.cuda.is_initialized()
def xpu_is_initialized() -> bool:
"""Check if XPU is initialized."""
if not torch.xpu._is_compiled():
return False
return torch.xpu.is_initialized()
def cuda_get_device_properties(
    device, names: Sequence[str], init_cuda=False
) -> tuple[Any, ...]:
    """Fetch selected CUDA device properties without initializing CUDA here.

    Args:
        device: Device handle passed through to
            ``torch.cuda.get_device_properties``.
        names: Attribute names to read from the properties object.
        init_cuda: When true, query CUDA directly in this process even if
            it has not been initialized yet.

    Returns:
        The requested property values, in the same order as ``names``.
    """
    if not (init_cuda or cuda_is_initialized()):
        # CUDA is untouched in this process: do the query in a forked child
        # (which calls back into this function with init_cuda=True) so that
        # CUDA is not initialized here as a side effect.
        fork_ctx = multiprocessing.get_context("fork")
        with ProcessPoolExecutor(max_workers=1, mp_context=fork_ctx) as pool:
            pending = pool.submit(cuda_get_device_properties, device, names, True)
            return pending.result()

    device_props = torch.cuda.get_device_properties(device)
    return tuple(getattr(device_props, attr) for attr in names)
def weak_bind( def weak_bind(
bound_method: Callable[..., Any], bound_method: Callable[..., Any],
) -> Callable[..., None]: ) -> Callable[..., None]:

View File

@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import multiprocessing
from collections.abc import Sequence
from concurrent.futures.process import ProcessPoolExecutor
from functools import cache
from typing import Any
import torch
def cuda_is_initialized() -> bool:
    """Return True only if torch was built with CUDA and CUDA is initialized."""
    # Short-circuit: is_initialized() is consulted only when torch reports
    # that it was compiled with CUDA support.
    return torch.cuda._is_compiled() and torch.cuda.is_initialized()
def xpu_is_initialized() -> bool:
"""Check if XPU is initialized."""
if not torch.xpu._is_compiled():
return False
return torch.xpu.is_initialized()
def cuda_get_device_properties(
    device, names: Sequence[str], init_cuda=False
) -> tuple[Any, ...]:
    """Fetch selected CUDA device properties without initializing CUDA here.

    Args:
        device: Device handle passed through to
            ``torch.cuda.get_device_properties``.
        names: Attribute names to read from the properties object.
        init_cuda: When true, query CUDA directly in this process even if
            it has not been initialized yet.

    Returns:
        The requested property values, in the same order as ``names``.
    """
    if not (init_cuda or cuda_is_initialized()):
        # CUDA is untouched in this process: do the query in a forked child
        # (which calls back into this function with init_cuda=True) so that
        # CUDA is not initialized here as a side effect.
        fork_ctx = multiprocessing.get_context("fork")
        with ProcessPoolExecutor(max_workers=1, mp_context=fork_ctx) as pool:
            pending = pool.submit(cuda_get_device_properties, device, names, True)
            return pending.result()

    device_props = torch.cuda.get_device_properties(device)
    return tuple(getattr(device_props, attr) for attr in names)
@cache
def is_pin_memory_available() -> bool:
    """Return whether the current platform reports pinned memory as available.

    The answer is delegated to ``current_platform.is_pin_memory_available()``
    and memoized by ``functools.cache``, so the platform is queried only on
    the first call.
    """
    # Imported inside the function rather than at module level
    # (presumably to avoid an import cycle with vllm.platforms -- confirm).
    from vllm.platforms import current_platform as platform

    pin_memory_ok = platform.is_pin_memory_available()
    return pin_memory_ok
@cache
def is_uva_available() -> bool:
    """Report whether Unified Virtual Addressing (UVA) can be used.

    Currently this is exactly the pinned-memory check; the result is
    memoized by ``functools.cache``.
    """
    # UVA requires pinned memory.
    # TODO: Add more requirements for UVA if needed.
    uva_ok = is_pin_memory_available()
    return uva_ok

View File

@ -34,12 +34,13 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton from vllm.triton_utils import tl, triton
from vllm.utils import cdiv, is_pin_memory_available from vllm.utils import cdiv
from vllm.utils.flashinfer import ( from vllm.utils.flashinfer import (
can_use_trtllm_attention, can_use_trtllm_attention,
flashinfer_disable_q_quantization, flashinfer_disable_q_quantization,
use_trtllm_attention, use_trtllm_attention,
) )
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.attention.backends.utils import ( from vllm.v1.attention.backends.utils import (
AttentionCGSupport, AttentionCGSupport,
AttentionMetadataBuilder, AttentionMetadataBuilder,

View File

@ -7,7 +7,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.attention import AttentionBackend from vllm.attention import AttentionBackend
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import ( from vllm.v1.kv_offload.worker.worker import (
OffloadingHandler, OffloadingHandler,

View File

@ -5,7 +5,7 @@ from dataclasses import dataclass
import torch import torch
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
pin_memory = is_pin_memory_available() pin_memory = is_pin_memory_available()

View File

@ -4,7 +4,7 @@
import torch import torch
from vllm.model_executor.layers.utils import apply_penalties from vllm.model_executor.layers.utils import apply_penalties
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import make_tensor_with_pad from vllm.utils.torch_utils import make_tensor_with_pad

View File

@ -6,7 +6,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from vllm.config.model import LogprobsMode from vllm.config.model import LogprobsMode
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.outputs import LogprobsTensors, SamplerOutput
from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.ops.bad_words import apply_bad_words from vllm.v1.sample.ops.bad_words import apply_bad_words

View File

@ -24,7 +24,7 @@ from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import is_pin_memory_available from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.v1.attention.backends.tree_attn import ( from vllm.v1.attention.backends.tree_attn import (
TreeAttentionMetadata, TreeAttentionMetadata,

View File

@ -72,13 +72,13 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import ( from vllm.utils import (
cdiv, cdiv,
check_use_alibi, check_use_alibi,
is_pin_memory_available,
length_from_prompt_token_ids_or_embeds, length_from_prompt_token_ids_or_embeds,
round_up, round_up,
) )
from vllm.utils.jsontree import json_map_leaves from vllm.utils.jsontree import json_map_leaves
from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import DeviceMemoryProfiler from vllm.utils.mem_utils import DeviceMemoryProfiler
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.utils.torch_utils import ( from vllm.utils.torch_utils import (
get_dtype_size, get_dtype_size,
kv_cache_dtype_str_to_dtype, kv_cache_dtype_str_to_dtype,

View File

@ -53,7 +53,8 @@ from vllm.multimodal.inputs import (
from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available, prev_power_of_2 from vllm.utils import LayerBlockType, cdiv, prev_power_of_2
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.attention.backends.pallas import ( from vllm.v1.attention.backends.pallas import (
TPU_STR_DTYPE_TO_TORCH_DTYPE, TPU_STR_DTYPE_TO_TORCH_DTYPE,
PallasAttentionBackend, PallasAttentionBackend,