Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-03-29 09:14:50 +08:00).
Commit: [Chore] Separate out vllm.utils.platform_utils.py (#27374)
Signed-off-by: Jonathan <chenleejonathan@gmail.com>
This commit is contained in:
parent
a9f55dc588
commit
ca76486a16
@ -3,7 +3,7 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.utils import is_uva_available
|
||||
from vllm.utils.platform_utils import is_uva_available
|
||||
from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor
|
||||
|
||||
CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
|
||||
|
||||
@ -21,7 +21,7 @@ from tests.v1.sample.utils import (
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.sample.logits_processor import (
|
||||
BatchUpdate,
|
||||
BatchUpdateBuilder,
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from tests.v1.sample.utils import create_allowed_token_ids
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import make_tensor_with_pad
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessors
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import make_tensor_with_pad
|
||||
from vllm.v1.pool.metadata import PoolingMetadata
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessors
|
||||
|
||||
@ -18,7 +18,7 @@ from typing import Any
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ import torch
|
||||
import torch.types
|
||||
|
||||
from vllm.lora.peft_helper import PEFTHelper
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
|
||||
class LoRALayerWeights:
|
||||
|
||||
@ -31,8 +31,8 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
|
||||
from vllm.model_executor.models.interfaces import is_pooling_model
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.cache import LRUCache
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (
|
||||
try_create_mm_pooling_model_cls,
|
||||
)
|
||||
from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@ -72,7 +72,7 @@ from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
|
||||
from vllm.multimodal.parse import MultiModalDataItems
|
||||
from vllm.multimodal.processing import PromptReplacement, PromptUpdate
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (
|
||||
|
||||
@ -22,8 +22,8 @@ from vllm.logger import init_logger
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.multimodal import NestedTensors
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils import (
|
||||
cdiv,
|
||||
from vllm.utils import cdiv
|
||||
from vllm.utils.platform_utils import (
|
||||
is_pin_memory_available,
|
||||
is_uva_available,
|
||||
)
|
||||
|
||||
@ -21,7 +21,7 @@ import torch
|
||||
import vllm.envs as envs
|
||||
from vllm.connections import global_http_connection
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import cuda_get_device_properties
|
||||
from vllm.utils.platform_utils import cuda_get_device_properties
|
||||
from vllm.utils.torch_utils import cuda_device_count_stateless
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
|
||||
@ -27,12 +27,8 @@ from argparse import (
|
||||
_ArgumentGroup,
|
||||
)
|
||||
from collections import defaultdict
|
||||
from collections.abc import (
|
||||
Callable,
|
||||
Sequence,
|
||||
)
|
||||
from concurrent.futures.process import ProcessPoolExecutor
|
||||
from functools import cache, partial, wraps
|
||||
from collections.abc import Callable
|
||||
from functools import partial, wraps
|
||||
from typing import TYPE_CHECKING, Any, TypeVar
|
||||
|
||||
import cloudpickle
|
||||
@ -44,6 +40,7 @@ import yaml
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import enable_trace_function_call, init_logger
|
||||
from vllm.ray.lazy_utils import is_in_ray_actor
|
||||
from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized
|
||||
|
||||
_DEPRECATED_MAPPINGS = {
|
||||
"cprofile": "profiling",
|
||||
@ -169,21 +166,6 @@ def round_down(x: int, y: int) -> int:
|
||||
return (x // y) * y
|
||||
|
||||
|
||||
@cache
def is_pin_memory_available() -> bool:
    """Return whether the current platform supports pinned (page-locked)
    host memory.

    The result is computed once and cached for the process lifetime.
    The platform module is imported inside the function — presumably to
    avoid a circular import at module load time; confirm against
    ``vllm.platforms``.
    """
    from vllm.platforms import current_platform

    return current_platform.is_pin_memory_available()
|
||||
|
||||
|
||||
@cache
def is_uva_available() -> bool:
    """Check if Unified Virtual Addressing (UVA) is available.

    The result is cached for the process lifetime.
    """
    # UVA requires pinned memory.
    # TODO: Add more requirements for UVA if needed.
    return is_pin_memory_available()
|
||||
|
||||
|
||||
# TODO: This function can be removed if transformer_modules classes are
|
||||
# serialized by value when communicating between processes
|
||||
def init_cached_hf_modules() -> None:
|
||||
@ -216,35 +198,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
|
||||
enable_trace_function_call(log_path)
|
||||
|
||||
|
||||
def cuda_is_initialized() -> bool:
    """Check if CUDA is initialized.

    Returns False when this torch build was compiled without CUDA support,
    since ``torch.cuda.is_initialized()`` is only meaningful otherwise.
    """
    if not torch.cuda._is_compiled():
        return False
    return torch.cuda.is_initialized()
|
||||
|
||||
|
||||
def xpu_is_initialized() -> bool:
    """Check if XPU is initialized.

    Returns False when this torch build was compiled without XPU support,
    since ``torch.xpu.is_initialized()`` is only meaningful otherwise.
    """
    if not torch.xpu._is_compiled():
        return False
    return torch.xpu.is_initialized()
|
||||
|
||||
|
||||
def cuda_get_device_properties(
    device, names: Sequence[str], init_cuda=False
) -> tuple[Any, ...]:
    """Get specified CUDA device property values without initializing CUDA in
    the current process.

    Args:
        device: Device to query (anything accepted by
            ``torch.cuda.get_device_properties``).
        names: Attribute names to read from the device-properties object.
        init_cuda: If True, query directly in this process (initializing
            CUDA as a side effect if it was not already initialized).

    Returns:
        A tuple with one value per entry in ``names``, in order.
    """
    if init_cuda or cuda_is_initialized():
        props = torch.cuda.get_device_properties(device)
        return tuple(getattr(props, name) for name in names)

    # Run in subprocess to avoid initializing CUDA as a side effect.
    # The child re-enters this function with init_cuda=True and sends back
    # only the plain tuple of values.
    # NOTE(review): the "fork" start method is hard-coded — unavailable on
    # some platforms (e.g. Windows); confirm this path is Linux-only.
    mp_ctx = multiprocessing.get_context("fork")
    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
        return executor.submit(cuda_get_device_properties, device, names, True).result()
|
||||
|
||||
|
||||
def weak_bind(
|
||||
bound_method: Callable[..., Any],
|
||||
) -> Callable[..., None]:
|
||||
|
||||
54
vllm/utils/platform_utils.py
Normal file
54
vllm/utils/platform_utils.py
Normal file
@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import multiprocessing
|
||||
from collections.abc import Sequence
|
||||
from concurrent.futures.process import ProcessPoolExecutor
|
||||
from functools import cache
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def cuda_is_initialized() -> bool:
    """Report whether the CUDA runtime is initialized in this process.

    Always False when this torch build has no CUDA support compiled in.
    """
    return torch.cuda._is_compiled() and torch.cuda.is_initialized()
|
||||
|
||||
|
||||
def xpu_is_initialized() -> bool:
|
||||
"""Check if XPU is initialized."""
|
||||
if not torch.xpu._is_compiled():
|
||||
return False
|
||||
return torch.xpu.is_initialized()
|
||||
|
||||
|
||||
def cuda_get_device_properties(
    device, names: Sequence[str], init_cuda=False
) -> tuple[Any, ...]:
    """Get specified CUDA device property values without initializing CUDA in
    the current process.

    Args:
        device: Device to query (anything accepted by
            ``torch.cuda.get_device_properties``).
        names: Attribute names to read from the device-properties object.
        init_cuda: If True, allow querying directly in this process.

    Returns:
        A tuple with one value per entry in ``names``, in order.
    """
    if not init_cuda and not cuda_is_initialized():
        # CUDA has not been touched here yet: delegate to a short-lived
        # forked worker so this process stays CUDA-free. The child calls
        # back into this function with init_cuda=True.
        ctx = multiprocessing.get_context("fork")
        with ProcessPoolExecutor(max_workers=1, mp_context=ctx) as pool:
            job = pool.submit(cuda_get_device_properties, device, names, True)
            return job.result()

    # CUDA is already initialized (or we were told to initialize it):
    # query the properties directly.
    props = torch.cuda.get_device_properties(device)
    return tuple(getattr(props, attr) for attr in names)
|
||||
|
||||
|
||||
@cache
def is_pin_memory_available() -> bool:
    """Whether the current platform supports pinned (page-locked) host memory.

    Computed once and cached for the process lifetime; the platform module
    is imported lazily at first call.
    """
    from vllm.platforms import current_platform as _platform

    return _platform.is_pin_memory_available()
|
||||
|
||||
|
||||
@cache
def is_uva_available() -> bool:
    """Whether Unified Virtual Addressing (UVA) can be used.

    Pinned host memory is currently the only prerequisite checked; the
    result is cached for the process lifetime.
    """
    # TODO: Add more requirements for UVA if needed.
    return is_pin_memory_available()
|
||||
@ -34,12 +34,13 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.utils import cdiv, is_pin_memory_available
|
||||
from vllm.utils import cdiv
|
||||
from vllm.utils.flashinfer import (
|
||||
can_use_trtllm_attention,
|
||||
flashinfer_disable_q_quantization,
|
||||
use_trtllm_attention,
|
||||
)
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.attention.backends.utils import (
|
||||
AttentionCGSupport,
|
||||
AttentionMetadataBuilder,
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.attention import AttentionBackend
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
|
||||
from vllm.v1.kv_offload.worker.worker import (
|
||||
OffloadingHandler,
|
||||
|
||||
@ -5,7 +5,7 @@ from dataclasses import dataclass
|
||||
import torch
|
||||
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
|
||||
pin_memory = is_pin_memory_available()
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.utils import apply_penalties
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import make_tensor_with_pad
|
||||
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config.model import LogprobsMode
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.outputs import LogprobsTensors, SamplerOutput
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.sample.ops.bad_words import apply_bad_words
|
||||
|
||||
@ -24,7 +24,7 @@ from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
|
||||
from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_pin_memory_available
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
|
||||
from vllm.v1.attention.backends.tree_attn import (
|
||||
TreeAttentionMetadata,
|
||||
|
||||
@ -72,13 +72,13 @@ from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
||||
from vllm.utils import (
|
||||
cdiv,
|
||||
check_use_alibi,
|
||||
is_pin_memory_available,
|
||||
length_from_prompt_token_ids_or_embeds,
|
||||
round_up,
|
||||
)
|
||||
from vllm.utils.jsontree import json_map_leaves
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import DeviceMemoryProfiler
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.utils.torch_utils import (
|
||||
get_dtype_size,
|
||||
kv_cache_dtype_str_to_dtype,
|
||||
|
||||
@ -53,7 +53,8 @@ from vllm.multimodal.inputs import (
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
|
||||
from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available, prev_power_of_2
|
||||
from vllm.utils import LayerBlockType, cdiv, prev_power_of_2
|
||||
from vllm.utils.platform_utils import is_pin_memory_available
|
||||
from vllm.v1.attention.backends.pallas import (
|
||||
TPU_STR_DTYPE_TO_TORCH_DTYPE,
|
||||
PallasAttentionBackend,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user