[Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
commit 94870359cd
parent 0d49483ea9
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.9.4 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
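The version pin above is the load-bearing part of this commit: judging by the config hunk further down, 0.10.0 is the compressed-tensors release that exposes the TENSOR_GROUP quantization strategy needed for NVFP4A16. As a minimal sketch (not part of this diff; the guard and its message are illustrative), the pin can be sanity-checked at runtime with standard tooling:

from importlib.metadata import version

from packaging.version import Version

# Hypothetical guard, not in the vLLM codebase: fail fast if an older
# compressed-tensors is installed, since the NVFP4A16 detection below
# relies on the TENSOR_GROUP strategy shipped in 0.10.0.
installed = Version(version("compressed-tensors"))
assert installed >= Version("0.10.0"), (
    f"compressed-tensors {installed} < 0.10.0; upgrade to match the pin")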
@@ -651,10 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output


-@pytest.mark.skip(reason="Skip until the model config is updated")
 def test_compressed_tensors_nvfp4a16(vllm_runner):
     # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16"
     with vllm_runner(model, enforce_eager=True) as llm:

         def check_model(model):
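This hunk swaps the stale FP4 test checkpoint for the NVFP4A16 one and drops the @pytest.mark.skip guard, re-enabling the weight-only FP4 test. A hedged, self-contained sketch of what the re-enabled test exercises, using the public vllm.LLM API rather than the test-only vllm_runner fixture (the prompt and token budget are illustrative):

from vllm import LLM, SamplingParams

# Load the weight-only NVFP4A16 checkpoint; enforce_eager=True mirrors the
# test and skips CUDA graph capture for faster startup.
llm = LLM(model="nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
          enforce_eager=True)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=16))
# Mirrors the test's `assert output`: generation produced non-empty text.
assert outputs and outputs[0].outputs[0].text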
@@ -222,15 +222,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                          input_quant: BaseModel):

         is_weight_only = weight_quant is not None and input_quant is None
-        is_group_quant = (
-            weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value)
         is_symmetric = weight_quant.symmetric

         is_group_size_16 = weight_quant.group_size == 16
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4

-        return (is_weight_only and is_group_quant and is_float_type
+        return (is_weight_only and is_tensor_group_quant and is_float_type
                 and is_4_bits and is_group_size_16 and is_symmetric)

     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
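The predicate change tracks compressed-tensors 0.10.0, where (as this hunk indicates) NVFP4A16 checkpoints describe their weight scheme with the TENSOR_GROUP strategy rather than plain GROUP. Condensed into a standalone sketch of the detection rule after this change (QuantizationStrategy and QuantizationType are the real compressed_tensors.quantization enums; the function name and duck-typed arguments are illustrative):

from compressed_tensors.quantization import (QuantizationStrategy,
                                             QuantizationType)


def looks_like_fp4a16_nvfp4(weight_quant, input_quant) -> bool:
    # Weight-only: weights are quantized, activations are not.
    if weight_quant is None or input_quant is not None:
        return False
    # NVFP4A16: symmetric 4-bit float weights, TENSOR_GROUP strategy,
    # group size 16.
    return (weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
            and weight_quant.type == QuantizationType.FLOAT
            and weight_quant.num_bits == 4
            and weight_quant.group_size == 16
            and weight_quant.symmetric)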