diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py
index 97ee06034137..66b44c27d6ee 100644
--- a/benchmarks/kernels/benchmark_bitblas.py
+++ b/benchmarks/kernels/benchmark_bitblas.py
@@ -3,6 +3,8 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
+from packaging import version
+
 from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
     MINIMUM_BITBLAS_VERSION,
 )
@@ -10,7 +12,7 @@ from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
 try:
     import bitblas
 
-    if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
+    if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
         raise ImportError(
             "bitblas version is wrong. Please "
             f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md
index 334df5dc9b7f..6b7086776025 100644
--- a/docs/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -200,7 +200,8 @@ vision-language model.
         lora_config = vllm_config.lora_config
         super().__init__(config, cache_config, quant_config, lora_config, prefix)
 
-if __version__ >= "0.6.4":
+from packaging import version
+if version.parse(__version__) >= version.parse("0.6.4"):
     MyModel = MyNewModel
 else:
     MyModel = MyOldModel
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py
index c27b377aebe9..f82ce5b4d4b6 100644
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
@@ -31,6 +31,8 @@ It supports page size >= 1.
 
 import logging
 
+from packaging import version
+
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 
@@ -40,7 +42,7 @@ logger = logging.getLogger(__name__)
 
 # Only print the following warnings when triton version < 3.2.0.
 # The issue won't affect performance or accuracy.
-if triton.__version__ < '3.2.0':
+if version.parse(triton.__version__) < version.parse('3.2.0'):
     logger.warning(
         "The following error message 'operation scheduled before its operands' "
         "can be ignored.")
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py
index aa8eee88a9f9..39bd34d351f6 100644
--- a/vllm/model_executor/layers/quantization/bitblas.py
+++ b/vllm/model_executor/layers/quantization/bitblas.py
@@ -3,6 +3,7 @@
 from typing import Any, Optional
 
 import torch
+from packaging import version
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -45,7 +46,8 @@ class BitBLASConfig(QuantizationConfig):
     ) -> None:
         try:
             import bitblas
-            if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
+            if version.parse(bitblas.__version__) < version.parse(
+                    MINIMUM_BITBLAS_VERSION):
                 raise ImportError(
                     "bitblas version is wrong. Please "
                     f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 5359189caa2a..0204ff46852f 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -4,6 +4,7 @@
 from typing import Any, Callable, Optional, Union
 
 import torch
+from packaging import version
 
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                         FusedMoEMethodBase)
@@ -169,7 +170,8 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.46.1":
+            if version.parse(
+                    bitsandbytes.__version__) < version.parse("0.46.1"):
                 raise ImportError("bitsandbytes version is wrong. Please "
                                   "install bitsandbytes>=0.46.1.")
         except ImportError as err:
@@ -412,7 +414,8 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.46.1":
+            if version.parse(
+                    bitsandbytes.__version__) < version.parse("0.46.1"):
                 raise ImportError("bitsandbytes version is wrong. Please "
                                   "install bitsandbytes>=0.46.1.")
         except ImportError as err:
diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py
index 8030be525944..2922aef32939 100644
--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -6,6 +6,7 @@ from typing import Any, Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from packaging import version
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -145,7 +146,7 @@
                 quant_config: DeepSpeedFPConfig):
         try:
             import deepspeed
-            if deepspeed.__version__ < "0.14.2":
+            if version.parse(deepspeed.__version__) < version.parse("0.14.2"):
                 raise ImportError("deepspeed version is wrong. Please "
                                   "install deepspeed>=0.14.2.")
             from deepspeed.ops.fp_quantizer import FP_Quantize
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index caeb266d0b93..d03074f86184 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -3,6 +3,7 @@
 from typing import Any, Optional
 
 import torch
+from packaging import version
 from torch.nn.parameter import Parameter
 
 from vllm.logger import init_logger
@@ -63,7 +64,8 @@
 
         try:
             import bitblas
-            if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
+            if version.parse(bitblas.__version__) < version.parse(
+                    MINIMUM_BITBLAS_VERSION):
                 raise ImportError(
                     "bitblas version is wrong. Please "
                     f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 428e9b882bca..9c458954f960 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -4,6 +4,7 @@
 from typing import Any, Optional
 
 import torch
+from packaging import version
 
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod)
@@ -135,7 +136,8 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod):
 
         try:
             import intel_extension_for_pytorch as ipex
-            if ipex.__version__ < MIN_IPEX_VERSION:
+            if version.parse(
+                    ipex.__version__) < version.parse(MIN_IPEX_VERSION):
                 raise ImportError(
                     "intel_extension_for_pytorch version is "
                     "wrong. Please install "
@@ -199,7 +201,8 @@ class IPEXAWQLinearMethod(AWQLinearMethod):
 
         try:
             import intel_extension_for_pytorch as ipex
-            if ipex.__version__ < MIN_IPEX_VERSION:
+            if version.parse(
+                    ipex.__version__) < version.parse(MIN_IPEX_VERSION):
                 raise ImportError(
                     "intel_extension_for_pytorch version is "
                     "wrong. Please install "
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
index 649d07b4d072..0eca3b4c024e 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
@@ -4,6 +4,7 @@
 from typing import Optional
 
 import torch
+from packaging import version
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
@@ -110,7 +111,8 @@ class BitBLASLinearKernel(MPLinearKernel):
 
         try:
             import bitblas
-            if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
+            if version.parse(bitblas.__version__) < version.parse(
+                    MINIMUM_BITBLAS_VERSION):
                 raise ImportError(
                     "bitblas version is wrong. Please "
                     f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
index 82ee3edfd5e1..4c2e54873586 100644
--- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 import torch
+from packaging import version
 
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
@@ -75,7 +76,8 @@
     # Finally, check if bitblas is installed
     try:
         import bitblas
-        if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
+        if version.parse(
+                bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
             raise ImportError("bitblas version is wrong. Please "
                               f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
     except ImportError:
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 47bb45793281..ddb50968904d 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -4,6 +4,7 @@
 from typing import Callable, Optional, Union
 
 import torch
+from packaging import version
 
 from vllm import _custom_ops as ops
 from vllm import envs
@@ -21,8 +22,8 @@
 TORCH_DEVICE_IDENTITY = None
 
 # torch._scaled_mm rowwise feature.
 # The condition is determined once as the operations
 # are time consuming.
-USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm()
-                               and torch.__version__[0:3] >= "2.7"
+USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse(
+    torch.__version__) >= version.parse("2.7")
                                and current_platform.has_device_capability(94))
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index f54dfab5238e..ea2fb2e3ac14 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -12,6 +12,7 @@ from typing import Any, Callable, Optional
 
 import numpy as np
 import torch
 from huggingface_hub import HfApi
+from packaging import version
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
@@ -193,7 +194,8 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.46.1":
+            if version.parse(
+                    bitsandbytes.__version__) < version.parse("0.46.1"):
                 raise ImportError("bitsandbytes version is wrong. Please "
                                   "install bitsandbytes>=0.46.1.")
         except ImportError as err:
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 460e1c0b05bc..e0434c8f3d71 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -5,6 +5,7 @@ from typing import Optional
 
 import torch
 import torch.nn as nn
+from packaging import version
 
 from vllm import envs
 from vllm.logger import init_logger
@@ -32,7 +33,7 @@ class TopKTopPSampler(nn.Module):
         if current_platform.is_cuda():
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
-                if flashinfer_version < "0.2.3":
+                if version.parse(flashinfer_version) < version.parse("0.2.3"):
                     logger.warning_once(
                         "FlashInfer version >= 0.2.3 required. "
                         "Falling back to default sampling implementation.")
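Why this change matters: Python compares strings lexicographically, so checks like
`bitblas.__version__ < MINIMUM_BITBLAS_VERSION` misorder releases as soon as any
version component reaches two digits, and the removed `torch.__version__[0:3]`
slice truncates rather than parses. A minimal standalone sketch of the failure
mode (illustrative version numbers only, not part of the patch):

    from packaging import version

    # String comparison goes character by character, so "10" sorts before "9":
    assert "0.10.0" < "0.9.0"                                # wrong ordering
    assert version.parse("0.10.0") > version.parse("0.9.0")  # correct

    # The torch.__version__[0:3] slice breaks the same way once the minor
    # version reaches two digits, e.g. "2.10.0":
    assert "2.10.0"[0:3] < "2.7"                             # "2.1" < "2.7"
    assert version.parse("2.10.0") >= version.parse("2.7")   # correct

`version.parse` implements PEP 440 ordering, which also handles pre-release and
local-version suffixes (e.g. "2.7.0a0+git...") that plain string comparison gets
wrong.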