diff --git a/tests/kernels/attention/test_deepgemm_attention.py b/tests/kernels/attention/test_deepgemm_attention.py index f4b4fac840151..74a5d81179623 100644 --- a/tests/kernels/attention/test_deepgemm_attention.py +++ b/tests/kernels/attention/test_deepgemm_attention.py @@ -6,7 +6,7 @@ import pytest import torch from vllm.platforms import current_platform -from vllm.utils import cdiv, has_deep_gemm +from vllm.utils import cdiv from vllm.utils.deep_gemm import ( _ceil_to_ue8m0, calc_diff, @@ -15,6 +15,7 @@ from vllm.utils.deep_gemm import ( get_num_sms, get_paged_mqa_logits_metadata, ) +from vllm.utils.import_utils import has_deep_gemm def kv_cache_cast_to_fp8(x: torch.Tensor) -> torch.Tensor: diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 94a305a063c3a..c517e5c026b46 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -23,7 +23,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk -from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx +from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx from .mk_objects import ( TestMoEQuantConfig, diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index aa41f89cae7dc..21eeffb1c7264 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -35,9 +35,9 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( cutlass_fp8_supported, ) from vllm.platforms import current_platform -from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.deep_gemm import is_deep_gemm_supported from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx @dataclass diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index 43fbd05775a28..90728c1e30a46 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -15,7 +15,7 @@ from torch.distributed import ProcessGroup from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] from typing_extensions import ParamSpec -from vllm.utils import has_deep_ep +from vllm.utils.import_utils import has_deep_ep from vllm.utils.network_utils import get_open_port if has_deep_ep(): diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 11b1e2ff3c27e..60f9f14b7f6f1 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -21,11 +21,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( modular_triton_fused_moe, ) from vllm.platforms import current_platform -from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import ( get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, ) +from vllm.utils.import_utils import has_deep_gemm dg_available = has_deep_gemm() diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 65cd3e110a0fa..d46f453488a98 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -21,8 +21,8 @@ from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform -from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported +from vllm.utils.import_utils import has_deep_ep, has_deep_gemm from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 527c20fe6f80b..b49319a7e6f54 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, ) from vllm.platforms import current_platform -from vllm.utils import has_deep_ep +from vllm.utils.import_utils import has_deep_ep from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index f78596d220bfa..b8f213d33c963 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -6,7 +6,7 @@ import pytest import torch import torch.nn.functional as F -from vllm.utils import has_triton_kernels +from vllm.utils.import_utils import has_triton_kernels if not has_triton_kernels(): pytest.skip( diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index a7beb313011ae..a46b0053e75a3 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -13,8 +13,8 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.config import VllmConfig, set_current_vllm_config from vllm.platforms import current_platform -from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.torch_utils import cuda_device_count_stateless from .modular_kernel_tools.common import ( diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index a6dfb5428c52e..55f092e7ea694 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -18,12 +18,12 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( w8a8_triton_block_scaled_mm, ) from vllm.platforms import current_platform -from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import ( fp8_gemm_nt, get_col_major_tma_aligned_tensor, per_block_cast_to_fp8, ) +from vllm.utils.import_utils import has_deep_gemm if current_platform.get_device_capability() < (9, 0): pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index bac613913e91f..2a6f34a9c4823 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -12,7 +12,6 @@ from tests.quantization.utils import is_quant_method_supported from vllm.attention.utils.fa_utils import flash_attn_supports_fp8 from vllm.platforms import current_platform from vllm.utils import STR_BACKEND_ENV_VAR - from ..utils import check_logprobs_close diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index fae48cbe33744..10db1c2c4e670 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -9,8 +9,8 @@ import vllm.envs as envs from vllm.distributed import get_dp_group, get_ep_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger -from vllm.utils import has_deep_ep, has_pplx from vllm.utils.flashinfer import has_flashinfer_all2all +from vllm.utils.import_utils import has_deep_ep, has_pplx from .base_device_communicator import All2AllManagerBase, Cache diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 9e84f0d00b082..6d3477cd19916 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -14,8 +14,9 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_Scheme, ) from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.utils import cdiv, has_triton_kernels +from vllm.utils import cdiv from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.import_utils import has_triton_kernels logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 69a815a4e3a96..484b8aa9d107c 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -26,12 +26,12 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, ) -from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import ( get_mk_alignment_for_contiguous_layout, m_grouped_fp8_gemm_nt_contiguous, ) from vllm.utils.func_utils import run_once +from vllm.utils.import_utils import has_deep_gemm logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 01fa9b99379b6..badedfc54c382 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP, ) from vllm.triton_utils import tl, triton -from vllm.utils import has_triton_kernels +from vllm.utils.import_utils import has_triton_kernels logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index feff92a162eee..6a61df739d140 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -55,8 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum -from vllm.utils import cdiv, has_deep_ep, has_pplx, round_up +from vllm.utils import cdiv, round_up from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.import_utils import has_deep_ep, has_pplx from vllm.utils.torch_utils import current_stream, direct_register_custom_op from vllm.v1.worker.ubatching import dbo_current_ubatch_id diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 447b31b92d8fa..e5681cb856258 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -93,7 +93,6 @@ from vllm.model_executor.parameter import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import ( fp8_gemm_nt, get_col_major_tma_aligned_tensor, @@ -102,6 +101,7 @@ from vllm.utils.deep_gemm import ( should_use_deepgemm_for_fp8_linear, ) from vllm.utils.flashinfer import has_flashinfer_moe +from vllm.utils.import_utils import has_deep_gemm if TYPE_CHECKING: from vllm.model_executor.models.utils import WeightsMapper diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index cf983fcf43c99..73f911514968f 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -48,11 +48,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_s from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import ( - has_triton_kernels, - round_up, -) +from vllm.utils import round_up from vllm.utils.flashinfer import has_flashinfer +from vllm.utils.import_utils import has_triton_kernels from vllm.utils.torch_utils import is_torch_equal_or_newer logger = init_logger(__name__) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 6704668c95265..230a6cfc52e30 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -5,7 +5,6 @@ import contextlib import datetime import enum import getpass -import importlib import inspect import json import multiprocessing @@ -1062,46 +1061,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool: ) -@cache -def _has_module(module_name: str) -> bool: - """Return True if *module_name* can be found in the current environment. - - The result is cached so that subsequent queries for the same module incur - no additional overhead. - """ - return importlib.util.find_spec(module_name) is not None - - -def has_pplx() -> bool: - """Whether the optional `pplx_kernels` package is available.""" - - return _has_module("pplx_kernels") - - -def has_deep_ep() -> bool: - """Whether the optional `deep_ep` package is available.""" - - return _has_module("deep_ep") - - -def has_deep_gemm() -> bool: - """Whether the optional `deep_gemm` package is available.""" - - return _has_module("deep_gemm") - - -def has_triton_kernels() -> bool: - """Whether the optional `triton_kernels` package is available.""" - - return _has_module("triton_kernels") - - -def has_tilelang() -> bool: - """Whether the optional `tilelang` package is available.""" - - return _has_module("tilelang") - - def set_process_title( name: str, suffix: str = "", prefix: str = envs.VLLM_PROCESS_NAME_PREFIX ) -> None: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index fce4111f19bb2..2e8cd302b0f55 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -16,7 +16,8 @@ import torch import vllm.envs as envs from vllm.logger import logger from vllm.platforms import current_platform -from vllm.utils import cdiv, has_deep_gemm +from vllm.utils import cdiv +from vllm.utils.import_utils import has_deep_gemm @functools.cache diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index fdc3d356a7ebe..65f588b52e5e6 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -324,3 +324,39 @@ class LazyLoader(ModuleType): if self._module is None: self._module = self._load() return dir(self._module) + + +# Optional dependency detection utilities +@cache +def _has_module(module_name: str) -> bool: + """Return True if *module_name* can be found in the current environment. + + The result is cached so that subsequent queries for the same module incur + no additional overhead. + """ + return importlib.util.find_spec(module_name) is not None + + +def has_pplx() -> bool: + """Whether the optional `pplx_kernels` package is available.""" + return _has_module("pplx_kernels") + + +def has_deep_ep() -> bool: + """Whether the optional `deep_ep` package is available.""" + return _has_module("deep_ep") + + +def has_deep_gemm() -> bool: + """Whether the optional `deep_gemm` package is available.""" + return _has_module("deep_gemm") + + +def has_triton_kernels() -> bool: + """Whether the optional `triton_kernels` package is available.""" + return _has_module("triton_kernels") + + +def has_tilelang() -> bool: + """Whether the optional `tilelang` package is available.""" + return _has_module("tilelang") diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index 3e6fd86e95d88..9de123263755b 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -22,7 +22,7 @@ from vllm.forward_context import ( from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import has_deep_gemm +from vllm.utils.import_utils import has_deep_gemm from vllm.v1.worker.ubatching import UBatchContext, make_ubatch_contexts logger = init_logger(__name__)