Make name of compressed-tensors quant method consistent across vLLM (#17255)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor 2025-04-28 17:28:13 +01:00 committed by GitHub
parent f94886946e
commit b6dd32aa07
5 changed files with 10 additions and 14 deletions
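
For context: after this change, "compressed-tensors" (with a hyphen) is the canonical spelling everywhere, and the legacy underscore form is normalized away on load. A minimal usage sketch, not part of this commit (the model name is one of the checkpoints from the test list below; the explicit quantization argument is optional here, since the method is also auto-detected from the checkpoint's quantization config):

    from vllm import LLM

    # The hyphenated name is now the one canonical spelling to pass.
    llm = LLM(model="neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8",
              quantization="compressed-tensors")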

@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
         ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
-        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]

@@ -752,9 +752,8 @@ class ModelConfig:
         supported_quantization = QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
-            "gptq_bitblas"
+            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
+            "quark", "nvfp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
@@ -764,6 +763,9 @@ class ModelConfig:
         if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+            quant_cfg["quant_method"] = quant_method
             # Detect which checkpoint it is
             for name in QUANTIZATION_METHODS:
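
A self-contained sketch of the normalization added above; the helper name _normalize_quant_method is hypothetical (the commit inlines this logic in ModelConfig), but the behavior matches: whichever spelling a checkpoint's quantization config carries, it resolves to the hyphenated name.

    def _normalize_quant_method(quant_cfg: dict) -> dict:
        """Map the legacy underscore spelling to the canonical hyphenated one."""
        quant_method = quant_cfg.get("quant_method", "").lower()
        quant_cfg["quant_method"] = quant_method.replace(
            "compressed_tensors", "compressed-tensors")
        return quant_cfg

    # Both spellings resolve to the same canonical name:
    assert (_normalize_quant_method({"quant_method": "compressed_tensors"})
            == {"quant_method": "compressed-tensors"})
    assert (_normalize_quant_method({"quant_method": "Compressed-Tensors"})
            == {"quant_method": "compressed-tensors"})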

@@ -72,7 +72,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         return 70

     def get_name(self) -> str:
-        return "compressed_tensors"
+        return "compressed-tensors"

     def get_quant_method(
         self,

@@ -130,8 +130,8 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
+        "quark", "ptpc_fp8"
     ]

     @classmethod

@@ -30,9 +30,7 @@ class TpuPlatform(Platform):
     ray_device_key: str = "TPU"
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"

-    supported_quantization: list[str] = [
-        "tpu_int8", "compressed-tensors", "compressed_tensors"
-    ]
+    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]

     additional_env_vars: list[str] = [
         "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"