Make name of compressed-tensors quant method consistent across vLLM (#17255)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor 2025-04-28 17:28:13 +01:00 committed by GitHub
parent f94886946e
commit b6dd32aa07
5 changed files with 10 additions and 14 deletions
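
For context: after this change, "compressed-tensors" (with a hyphen) is the canonical spelling everywhere, and the legacy underscore form is normalized away on load. A minimal usage sketch, not part of this commit (the model name is one of the checkpoints from the test list below; the explicit quantization argument is optional here, since the method is also auto-detected from the checkpoint's quantization config):

    from vllm import LLM

    # The hyphenated name is now the one canonical spelling to pass.
    llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
              quantization="compressed-tensors")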

@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
         ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
-        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]

@@ -752,9 +752,8 @@ class ModelConfig:
         supported_quantization = QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
-            "gptq_bitblas"
+            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
+            "quark", "nvfp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
@@ -764,6 +763,9 @@ class ModelConfig:
         if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+            quant_cfg["quant_method"] = quant_method
             # Detect which checkpoint it is
             for name in QUANTIZATION_METHODS:
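
A self-contained sketch of the normalization added above; the helper name _normalize_quant_method is hypothetical (the commit inlines this logic in ModelConfig), but the behavior matches: whichever spelling a checkpoint's quantization config carries, it resolves to the hyphenated name.

    def _normalize_quant_method(quant_cfg: dict) -> dict:
        """Map the legacy underscore spelling to the canonical hyphenated one."""
        quant_method = quant_cfg.get("quant_method", "").lower()
        quant_cfg["quant_method"] = quant_method.replace(
            "compressed_tensors", "compressed-tensors")
        return quant_cfg

    # Both spellings resolve to the same canonical name:
    assert (_normalize_quant_method({"quant_method": "compressed_tensors"})
            == {"quant_method": "compressed-tensors"})
    assert (_normalize_quant_method({"quant_method": "Compressed-Tensors"})
            == {"quant_method": "compressed-tensors"})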

@@ -72,7 +72,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         return 70

     def get_name(self) -> str:
-        return "compressed_tensors"
+        return "compressed-tensors"

     def get_quant_method(
         self,

@@ -130,8 +130,8 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
+        "quark", "ptpc_fp8"
     ]

     @classmethod

@@ -30,9 +30,7 @@ class TpuPlatform(Platform):
     ray_device_key: str = "TPU"
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"

-    supported_quantization: list[str] = [
-        "tpu_int8", "compressed-tensors", "compressed_tensors"
-    ]
+    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]

     additional_env_vars: list[str] = [
         "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"