diff --git a/requirements/common.txt b/requirements/common.txt
index de4b3b53166c9..a6a1ffe76196b 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.4 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 03480343d4bd8..2c07fe29fb0e6 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -651,10 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
     assert output
 
 
-@pytest.mark.skip(reason="Skip until the model config is updated")
 def test_compressed_tensors_nvfp4a16(vllm_runner):
     # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
     with vllm_runner(model, enforce_eager=True) as llm:
 
         def check_model(model):
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index dff62af863895..1ee4617e10544 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -222,15 +222,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                          input_quant: BaseModel):
 
         is_weight_only = weight_quant is not None and input_quant is None
-        is_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
         is_symmetric = weight_quant.symmetric
 
         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4
 
-        return (is_weight_only and is_group_quant and is_float_type
+        return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)
 
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
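
Note (not part of the patch): the dependency bump and the detection change are coupled. NVFP4A16 checkpoints declare the "tensor_group" quantization strategy, and the QuantizationStrategy.TENSOR_GROUP member the updated _is_fp4a16_nvfp4 check compares against is only defined in newer compressed-tensors releases. A minimal sketch of a sanity check, assuming compressed-tensors 0.10.0 from this patch is installed:

    # Verifies the enum member this PR relies on is importable; on an
    # older release such as 0.9.4, where only GROUP exists, the
    # attribute access below would raise AttributeError instead.
    from compressed_tensors.quantization import QuantizationStrategy

    print(QuantizationStrategy.TENSOR_GROUP.value)  # expected: "tensor_group"
    print(QuantizationStrategy.GROUP.value)         # expected: "group"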