From 373538f973ac4ea93a4b675655b893bf495ac050 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Tue, 13 Aug 2024 19:05:15 -0400
Subject: [PATCH] [Misc] `compressed-tensors` code reuse (#7277)

---
 requirements-common.txt                       |  1 +
 requirements-test.txt                         |  2 +-
 tests/quantization/test_compressed_tensors.py |  3 +-
 .../compressed_tensors/compressed_tensors.py  |  7 +-
 .../schemes/compressed_tensors_w8a16_fp8.py   |  3 +-
 .../schemes/compressed_tensors_w8a8_fp8.py    |  3 +-
 .../schemes/compressed_tensors_w8a8_int8.py   |  3 +-
 .../quantization/compressed_tensors/utils.py  | 76 +------------------
 8 files changed, 13 insertions(+), 85 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 170c3e06ba226..2f006c887dab7 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -23,3 +23,4 @@ pyzmq
 librosa # Required for audio processing
 soundfile # Required for audio processing
 gguf == 0.9.1
+compressed-tensors == 0.5.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 62d6cc49eade4..8e003a09703ad 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -17,7 +17,7 @@ peft
 requests
 ray
 sentence-transformers # required for embedding
-compressed-tensors==0.4.0 # required for compressed-tensors
+compressed-tensors==0.5.0 # required for compressed-tensors
 timm # required for internvl test
 
 # TODO: Add this after fully implementing llava(mantis)
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 2ea340779b819..7924d187df602 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,13 +5,12 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
 
 import pytest
 import torch
+from compressed_tensors.quantization import QuantizationType
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationType)
 
 
 @pytest.mark.parametrize("model_args", [
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index ae75781927381..57c57d5d95c71 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,6 +1,10 @@
 from typing import Any, Dict, List, Optional
 
 import torch
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
 from pydantic import BaseModel
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -13,8 +17,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
     CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    CompressionFormat, QuantizationArgs, QuantizationStrategy,
-    QuantizationType, find_matched_target, is_activation_quantization_format,
+    find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 3d55d55cc390d..1671a23d77c63 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,11 +1,10 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 8a3d24e2fd258..e0e1b8bad4ec3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 078380f159291..5597dc888b7b2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_int8_linear, convert_to_channelwise)
 from vllm.model_executor.parameter import (BasevLLMParameter,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 7912cbde5721f..a74eaef5efdee 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,85 +1,13 @@
 import re
-from enum import Enum
-from typing import Any, Dict, Iterable, Optional
+from typing import Iterable, Optional
 
-from pydantic import BaseModel, Field
+from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     FUSED_LAYER_NAME_MAPPING)
 
 
-class CompressionFormat(Enum):
-    dense = "dense"
-    sparse_bitmask = "sparse-bitmask"
-    naive_quantized = "naive-quantized"
-    float_quantized = "float-quantized"
-    int_quantized = "int-quantized"
-    pack_quantized = "pack-quantized"
-    marlin_24 = "marlin-24"
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-    TOKEN = "token"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config
-    for weights or activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantized to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric
-    :param strategy: string determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block
-    strategy, must be of the format "2x4", "8x16", etc.
-    :param dynamic: set True to perform dynamic quantization -
-        values will not be calibrated during calibration phase,
-        instead during inference new quantization ranges will be
-        observed with every sample. Defaults to False for static
-        quantization. Note that enabling dynamic quantization
-        will change the default observer to a memoryless one
-    """
-
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
-    dynamic: bool = False
-    observer: str = Field(
-        default="minmax",
-        description=("The class to use to compute the quantization param - "
-                     "scale and zero-point'"),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=
-        ("optional dict of kwargs to be passed directly to torch quantization "
-         "Observers constructor excluding quantization range or symmetry"),
-    )
-
-
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,
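
For reference (not part of the patch itself): a minimal sketch of how the definitions vLLM now reuses are consumed from the compressed-tensors package, assuming compressed-tensors == 0.5.0 as pinned above. The imports mirror the ones added in compressed_tensors.py, and the field and enum names come from the vendored copies this patch deletes from utils.py.

# Hypothetical usage sketch; names are taken from the classes the patch removes
# from vLLM and now imports from the compressed-tensors package instead.
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy,
                                             QuantizationType)

# Per-channel, symmetric 8-bit float weight quantization, described with the
# same pydantic model that the deleted utils.py code defined locally.
weight_args = QuantizationArgs(
    num_bits=8,
    type=QuantizationType.FLOAT,
    strategy=QuantizationStrategy.CHANNEL,
    symmetric=True,
    dynamic=False,
)

print(weight_args.type)                          # QuantizationType.FLOAT
print(CompressionFormat.float_quantized.value)   # "float-quantized"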