[Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)

Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Author: Dipika Sikka
Date: 2025-06-06 04:21:54 -04:00 (committed by GitHub)
Commit: 94870359cd (parent: 0d49483ea9)
3 changed files with 5 additions and 6 deletions

@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.4 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files

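For anyone reproducing this bump locally, a quick way to confirm that the environment actually picked up the new pin is to query the installed package metadata. The snippet below is an illustrative sketch, not part of this diff; it only assumes the package is published under the name "compressed-tensors".

# Illustrative check (not part of this commit): confirm the installed
# compressed-tensors version matches the new pin before running the tests.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("compressed-tensors")
except PackageNotFoundError:
    raise SystemExit("compressed-tensors is not installed; "
                     "try `pip install compressed-tensors==0.10.0`")

assert installed == "0.10.0", f"expected 0.10.0, found {installed}"
print(f"compressed-tensors {installed} is installed")
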
@@ -651,10 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output


-@pytest.mark.skip(reason="Skip until the model config is updated")
 def test_compressed_tensors_nvfp4a16(vllm_runner):
     # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
     with vllm_runner(model, enforce_eager=True) as llm:

         def check_model(model):

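The renamed checkpoint can also be exercised outside of pytest. The sketch below is a hypothetical, minimal reproduction of what vllm_runner does in the updated test, using vLLM's public LLM API; the prompt and sampling settings are illustrative and not taken from the test, and a CUDA-capable environment is assumed.

# Minimal sketch: load the renamed NVFP4A16 checkpoint the same way the
# test does (enforce_eager=True mirrors the vllm_runner call above).
from vllm import LLM, SamplingParams

llm = LLM(model="nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
          enforce_eager=True)

# Prompt and sampling parameters are illustrative only.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
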
@@ -222,15 +222,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                          input_quant: BaseModel):

         is_weight_only = weight_quant is not None and input_quant is None
-        is_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
         is_symmetric = weight_quant.symmetric

         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4

-        return (is_weight_only and is_group_quant and is_float_type
+        return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)

     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
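
The hunk above switches the NVFP4A16 weight-only detection from the GROUP strategy to the TENSOR_GROUP strategy exposed by the newer compressed-tensors release. As a reading aid, the sketch below restates the same six-condition predicate with stand-in types; WeightArgs and the local enum definitions are hypothetical stand-ins that only mirror the names appearing in the diff.

# Stand-alone restatement of the predicate above. `WeightArgs` is a
# hypothetical stand-in for the compressed-tensors quantization args.
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class QuantizationStrategy(str, Enum):
    GROUP = "group"
    TENSOR_GROUP = "tensor_group"


class QuantizationType(str, Enum):
    INT = "int"
    FLOAT = "float"


@dataclass
class WeightArgs:
    strategy: str
    type: str
    num_bits: int
    group_size: int
    symmetric: bool


def is_fp4a16_nvfp4(weight_quant: Optional[WeightArgs],
                    input_quant: Optional[WeightArgs]) -> bool:
    # Weight-only: weights are quantized, activations stay in 16-bit.
    if weight_quant is None or input_quant is not None:
        return False
    return (weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
            and weight_quant.type == QuantizationType.FLOAT.value
            and weight_quant.num_bits == 4
            and weight_quant.group_size == 16
            and weight_quant.symmetric)


# Example: a config that matches the NVFP4A16 scheme.
nvfp4 = WeightArgs(strategy="tensor_group", type="float", num_bits=4,
                   group_size=16, symmetric=True)
print(is_fp4a16_nvfp4(nvfp4, None))  # True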