Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 03:54:56 +08:00)
Make name of compressed-tensors quant method consistent across vLLM (#17255)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit b6dd32aa07
parent f94886946e
@@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
         ("facebook/opt-125m", {}),
         ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
         ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
             "dtype": torch.float16,
-            "quantization": "compressed-tensors"
         }),
-        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
-            "quantization": "compressed-tensors"
-        }),
+        ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", {}),
     ]
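Note on the test change: with the method name now consistent everywhere, the explicit "quantization" override in these entries is redundant, since vLLM detects compressed-tensors checkpoints from the model's own quantization config. A minimal sketch of how such (model, kwargs) pairs are typically consumed; the loop below is illustrative, not this test's actual harness:

    from vllm import LLM

    # Each (model, kwargs) entry expands into LLM constructor arguments.
    # "quantization" no longer needs to be passed for compressed-tensors
    # checkpoints; it is inferred from the checkpoint's config.
    for model, kwargs in models_list(all=False, keywords=["w8a8"]):
        llm = LLM(model=model, **kwargs)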
@@ -752,9 +752,8 @@ class ModelConfig:
         supported_quantization = QUANTIZATION_METHODS
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed_tensors",
-            "compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
-            "gptq_bitblas"
+            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
+            "quark", "nvfp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
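The deduplicated list feeds a soft check, not a hard gate: methods outside optimized_quantization_methods still load, but vLLM logs a performance warning. A paraphrased sketch of that check, assuming a module-level logger and an already-normalized self.quantization value:

    import logging

    logger = logging.getLogger(__name__)

    def warn_if_unoptimized(quantization: str, optimized: list[str]) -> None:
        # Paraphrased: unoptimized methods are allowed, just flagged as
        # potentially slower than non-quantized inference.
        if quantization not in optimized:
            logger.warning(
                "%s quantization is not fully optimized yet; the speed can "
                "be slower than non-quantized models.", quantization)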
@@ -764,6 +763,9 @@ class ModelConfig:

         if quant_cfg is not None:
             quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_method.replace("compressed_tensors",
+                                                "compressed-tensors")
+            quant_cfg["quant_method"] = quant_method

             # Detect which checkpoint is it
             for name in QUANTIZATION_METHODS:
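These three added lines canonicalize whatever spelling a checkpoint ships, so every later comparison sees the hyphenated form. A self-contained sketch of the same normalization applied to a hypothetical checkpoint quantization config:

    # Hypothetical quant config as it might appear in a checkpoint's
    # config.json (key spellings vary across tools).
    quant_cfg = {"quant_method": "Compressed_Tensors"}

    # Same normalization as above: lowercase, then canonicalize the name.
    quant_method = quant_cfg.get("quant_method", "").lower()
    quant_method = quant_method.replace("compressed_tensors",
                                        "compressed-tensors")
    quant_cfg["quant_method"] = quant_method

    assert quant_cfg["quant_method"] == "compressed-tensors"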
@@ -72,7 +72,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         return 70

     def get_name(self) -> str:
-        return "compressed_tensors"
+        return "compressed-tensors"

     def get_quant_method(
         self,
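The string returned by get_name() is compared against user-supplied --quantization values and the supported/optimized lists, so it must match the canonical spelling exactly. A minimal illustration of the mismatch this rename removes (the list here is abbreviated):

    supported = ["tpu_int8", "compressed-tensors"]

    old_name = "compressed_tensors"   # what get_name() returned before
    new_name = "compressed-tensors"   # what it returns after this commit

    assert old_name not in supported  # the inconsistency being fixed
    assert new_name in supported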
@@ -130,8 +130,8 @@ class RocmPlatform(Platform):
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"

     supported_quantization: list[str] = [
-        "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
-        "fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
+        "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
+        "quark", "ptpc_fp8"
     ]

     @classmethod
@@ -30,9 +30,7 @@ class TpuPlatform(Platform):
     ray_device_key: str = "TPU"
     device_control_env_var: str = "TPU_VISIBLE_CHIPS"

-    supported_quantization: list[str] = [
-        "tpu_int8", "compressed-tensors", "compressed_tensors"
-    ]
+    supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]

     additional_env_vars: list[str] = [
         "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
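Both platform hunks drop the underscore alias that the normalization in ModelConfig makes unnecessary. A sketch of the kind of gate these supported_quantization lists back; the verify_quantization method mirrors vLLM's Platform interface, but treat this body as illustrative:

    class Platform:
        supported_quantization: list[str] = []

        @classmethod
        def verify_quantization(cls, quant: str) -> None:
            # Illustrative platform-side gate: reject methods the current
            # platform cannot run.
            if quant not in cls.supported_quantization:
                raise ValueError(
                    f"{quant} quantization is currently not supported on "
                    "this platform.")

    class TpuPlatform(Platform):
        supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]

    # With one canonical name, a single list entry per method suffices.
    TpuPlatform.verify_quantization("compressed-tensors")  # passes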