[Chore] Separate out vllm.utils.platform_utils.py (#27374)

Signed-off-by: Jonathan <chenleejonathan@gmail.com>
Jonathan Chen 2025-10-23 15:08:06 -04:00 committed by GitHub
parent a9f55dc588
commit ca76486a16
21 changed files with 79 additions and 70 deletions
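
In short: the platform-probing helpers (is_pin_memory_available, is_uva_available, cuda_is_initialized, xpu_is_initialized, cuda_get_device_properties) move out of vllm/utils/__init__.py into the new vllm/utils/platform_utils.py, and every call site updates its import accordingly:

    # Before this commit
    from vllm.utils import is_pin_memory_available

    # After this commit
    from vllm.utils.platform_utils import is_pin_memory_available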

View File

@@ -3,7 +3,7 @@
 import pytest
 import torch
 
-from vllm.utils import is_uva_available
+from vllm.utils.platform_utils import is_uva_available
 from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
 
 CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]

View File

@@ -21,7 +21,7 @@ from tests.v1.sample.utils import (
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.sample.logits_processor import (
     BatchUpdate,
     BatchUpdateBuilder,

View File

@@ -7,7 +7,7 @@ import torch
 
 from tests.v1.sample.utils import create_allowed_token_ids
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata

View File

@@ -10,7 +10,7 @@ import torch
 
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors

View File

@@ -18,7 +18,7 @@ from typing import Any
 
 import torch
 
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)

View File

@@ -8,7 +8,7 @@ import torch
 import torch.types
 
 from vllm.lora.peft_helper import PEFTHelper
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 
 class LoRALayerWeights:

View File

@@ -31,8 +31,8 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
-from vllm.utils import is_pin_memory_available
 from vllm.utils.cache import LRUCache
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)

View File

@@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (
     try_create_mm_pooling_model_cls,
 )
 from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 logger = init_logger(__name__)

View File

@@ -72,7 +72,7 @@ from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (

View File

@@ -22,8 +22,8 @@ from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import NestedTensors
 from vllm.sequence import IntermediateTensors
-from vllm.utils import (
-    cdiv,
+from vllm.utils import cdiv
+from vllm.utils.platform_utils import (
     is_pin_memory_available,
     is_uva_available,
 )

View File

@@ -21,7 +21,7 @@ import torch
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
-from vllm.utils import cuda_get_device_properties
+from vllm.utils.platform_utils import cuda_get_device_properties
 from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.version import __version__ as VLLM_VERSION

View File

@@ -27,12 +27,8 @@ from argparse import (
     _ArgumentGroup,
 )
 from collections import defaultdict
-from collections.abc import (
-    Callable,
-    Sequence,
-)
-from concurrent.futures.process import ProcessPoolExecutor
-from functools import cache, partial, wraps
+from collections.abc import Callable
+from functools import partial, wraps
 from typing import TYPE_CHECKING, Any, TypeVar
 
 import cloudpickle
@@ -44,6 +40,7 @@ import yaml
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.ray.lazy_utils import is_in_ray_actor
+from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized
 
 _DEPRECATED_MAPPINGS = {
     "cprofile": "profiling",
@@ -169,21 +166,6 @@ def round_down(x: int, y: int) -> int:
     return (x // y) * y
 
 
-@cache
-def is_pin_memory_available() -> bool:
-    from vllm.platforms import current_platform
-
-    return current_platform.is_pin_memory_available()
-
-
-@cache
-def is_uva_available() -> bool:
-    """Check if Unified Virtual Addressing (UVA) is available."""
-    # UVA requires pinned memory.
-    # TODO: Add more requirements for UVA if needed.
-    return is_pin_memory_available()
-
-
 # TODO: This function can be removed if transformer_modules classes are
 # serialized by value when communicating between processes
 def init_cached_hf_modules() -> None:
@@ -216,35 +198,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
     enable_trace_function_call(log_path)
 
 
-def cuda_is_initialized() -> bool:
-    """Check if CUDA is initialized."""
-    if not torch.cuda._is_compiled():
-        return False
-    return torch.cuda.is_initialized()
-
-
-def xpu_is_initialized() -> bool:
-    """Check if XPU is initialized."""
-    if not torch.xpu._is_compiled():
-        return False
-    return torch.xpu.is_initialized()
-
-
-def cuda_get_device_properties(
-    device, names: Sequence[str], init_cuda=False
-) -> tuple[Any, ...]:
-    """Get specified CUDA device property values without initializing CUDA in
-    the current process."""
-    if init_cuda or cuda_is_initialized():
-        props = torch.cuda.get_device_properties(device)
-        return tuple(getattr(props, name) for name in names)
-
-    # Run in subprocess to avoid initializing CUDA as a side effect.
-    mp_ctx = multiprocessing.get_context("fork")
-    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
-        return executor.submit(cuda_get_device_properties, device, names, True).result()
-
-
 def weak_bind(
     bound_method: Callable[..., Any],
 ) -> Callable[..., None]:

View File

@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+from collections.abc import Sequence
+from concurrent.futures.process import ProcessPoolExecutor
+from functools import cache
+from typing import Any
+
+import torch
+
+
+def cuda_is_initialized() -> bool:
+    """Check if CUDA is initialized."""
+    if not torch.cuda._is_compiled():
+        return False
+    return torch.cuda.is_initialized()
+
+
+def xpu_is_initialized() -> bool:
+    """Check if XPU is initialized."""
+    if not torch.xpu._is_compiled():
+        return False
+    return torch.xpu.is_initialized()
+
+
+def cuda_get_device_properties(
+    device, names: Sequence[str], init_cuda=False
+) -> tuple[Any, ...]:
+    """Get specified CUDA device property values without initializing CUDA in
+    the current process."""
+    if init_cuda or cuda_is_initialized():
+        props = torch.cuda.get_device_properties(device)
+        return tuple(getattr(props, name) for name in names)
+
+    # Run in subprocess to avoid initializing CUDA as a side effect.
+    mp_ctx = multiprocessing.get_context("fork")
+    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
+        return executor.submit(cuda_get_device_properties, device, names, True).result()
+
+
+@cache
+def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
+
+    return current_platform.is_pin_memory_available()
+
+
+@cache
+def is_uva_available() -> bool:
+    """Check if Unified Virtual Addressing (UVA) is available."""
+    # UVA requires pinned memory.
+    # TODO: Add more requirements for UVA if needed.
+    return is_pin_memory_available()
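
For reference, a minimal usage sketch of the relocated helpers; the device index and the property names queried below are illustrative examples, not part of this diff:

    from vllm.utils.platform_utils import (
        cuda_get_device_properties,
        is_pin_memory_available,
        is_uva_available,
    )

    # When CUDA is not yet initialized, properties are read in a forked
    # subprocess so the calling process stays free of CUDA state.
    name, total_memory = cuda_get_device_properties(0, ("name", "total_memory"))

    pin = is_pin_memory_available()  # cached after the first call (@cache)
    uva = is_uva_available()         # currently equivalent to pinned-memory support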

View File

@@ -34,12 +34,13 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv, is_pin_memory_available
+from vllm.utils import cdiv
 from vllm.utils.flashinfer import (
     can_use_trtllm_attention,
     flashinfer_disable_q_quantization,
     use_trtllm_attention,
 )
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,

View File

@@ -7,7 +7,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.attention import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,

View File

@@ -5,7 +5,7 @@ from dataclasses import dataclass
 
 import torch
 
 from vllm.pooling_params import PoolingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 
 pin_memory = is_pin_memory_available()

View File

@@ -4,7 +4,7 @@
 import torch
 
 from vllm.model_executor.layers.utils import apply_penalties
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad

View File

@@ -6,7 +6,7 @@ import torch
 import torch.nn as nn
 
 from vllm.config.model import LogprobsMode
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words

View File

@@ -24,7 +24,7 @@ from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.tree_attn import (
     TreeAttentionMetadata,

View File

@@ -72,13 +72,13 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
 from vllm.utils import (
     cdiv,
     check_use_alibi,
-    is_pin_memory_available,
     length_from_prompt_token_ids_or_embeds,
     round_up,
 )
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
     get_dtype_size,
     kv_cache_dtype_str_to_dtype,

View File

@@ -53,7 +53,8 @@ from vllm.multimodal.inputs import (
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available, prev_power_of_2
+from vllm.utils import LayerBlockType, cdiv, prev_power_of_2
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.pallas import (
     TPU_STR_DTYPE_TO_TORCH_DTYPE,
     PallasAttentionBackend,