[Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Author: Dipika Sikka
Date: 2025-06-06 04:21:54 -04:00 (committed by GitHub)
Commit: 94870359cd (parent: 0d49483ea9)
3 changed files with 5 additions and 6 deletions

@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.4 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files

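For anyone reproducing this bump locally, a quick way to confirm that the environment actually picked up the new pin is to query the installed package metadata. The snippet below is an illustrative sketch, not part of this diff; it only assumes the package is published under the name "compressed-tensors".

# Illustrative check (not part of this commit): confirm the installed
# compressed-tensors version matches the new pin before running the tests.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("compressed-tensors")
except PackageNotFoundError:
    raise SystemExit("compressed-tensors is not installed; "
                     "try `pip install compressed-tensors==0.10.0`")

assert installed == "0.10.0", f"expected 0.10.0, found {installed}"
print(f"compressed-tensors {installed} is installed")
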
@@ -651,10 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output


-@pytest.mark.skip(reason="Skip until the model config is updated")
 def test_compressed_tensors_nvfp4a16(vllm_runner):
     # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
     with vllm_runner(model, enforce_eager=True) as llm:

         def check_model(model):

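The renamed checkpoint can also be exercised outside of pytest. The sketch below is a hypothetical, minimal reproduction of what vllm_runner does in the updated test, using vLLM's public LLM API; the prompt and sampling settings are illustrative and not taken from the test, and a CUDA-capable environment is assumed.

# Minimal sketch: load the renamed NVFP4A16 checkpoint the same way the
# test does (enforce_eager=True mirrors the vllm_runner call above).
from vllm import LLM, SamplingParams

llm = LLM(model="nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
          enforce_eager=True)

# Prompt and sampling parameters are illustrative only.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
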
@@ -222,15 +222,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                          input_quant: BaseModel):

         is_weight_only = weight_quant is not None and input_quant is None
-        is_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
         is_symmetric = weight_quant.symmetric

         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4

-        return (is_weight_only and is_group_quant and is_float_type
+        return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)

     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
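
The hunk above switches the NVFP4A16 weight-only detection from the GROUP strategy to the TENSOR_GROUP strategy exposed by the newer compressed-tensors release. As a reading aid, the sketch below restates the same six-condition predicate with stand-in types; WeightArgs and the local enum definitions are hypothetical stand-ins that only mirror the names appearing in the diff.

# Stand-alone restatement of the predicate above. `WeightArgs` is a
# hypothetical stand-in for the compressed-tensors quantization args.
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class QuantizationStrategy(str, Enum):
    GROUP = "group"
    TENSOR_GROUP = "tensor_group"


class QuantizationType(str, Enum):
    INT = "int"
    FLOAT = "float"


@dataclass
class WeightArgs:
    strategy: str
    type: str
    num_bits: int
    group_size: int
    symmetric: bool


def is_fp4a16_nvfp4(weight_quant: Optional[WeightArgs],
                    input_quant: Optional[WeightArgs]) -> bool:
    # Weight-only: weights are quantized, activations stay in 16-bit.
    if weight_quant is None or input_quant is not None:
        return False
    return (weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
            and weight_quant.type == QuantizationType.FLOAT.value
            and weight_quant.num_bits == 4
            and weight_quant.group_size == 16
            and weight_quant.symmetric)


# Example: a config that matches the NVFP4A16 scheme.
nvfp4 = WeightArgs(strategy="tensor_group", type="float", num_bits=4,
                   group_size=16, symmetric=True)
print(is_fp4a16_nvfp4(nvfp4, None))  # True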