mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 11:06:15 +08:00
Make name of compressed-tensors quant method consistent across vLLM (#17255)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
f94886946e
commit
b6dd32aa07
@ -20,15 +20,11 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
|
|||||||
("facebook/opt-125m", {}),
|
("facebook/opt-125m", {}),
|
||||||
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
|
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
|
||||||
"dtype": torch.float16,
|
"dtype": torch.float16,
|
||||||
"quantization": "compressed-tensors"
|
|
||||||
}),
|
}),
|
||||||
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
|
("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
|
||||||
"dtype": torch.float16,
|
"dtype": torch.float16,
|
||||||
"quantization": "compressed-tensors"
|
|
||||||
}),
|
|
||||||
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
|
|
||||||
"quantization": "compressed-tensors"
|
|
||||||
}),
|
}),
|
||||||
|
("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {}),
|
||||||
("meta-llama/Llama-3.2-1B-Instruct", {}),
|
("meta-llama/Llama-3.2-1B-Instruct", {}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -752,9 +752,8 @@ class ModelConfig:
|
|||||||
supported_quantization = QUANTIZATION_METHODS
|
supported_quantization = QUANTIZATION_METHODS
|
||||||
optimized_quantization_methods = [
|
optimized_quantization_methods = [
|
||||||
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
|
"fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
|
||||||
"awq_marlin", "fbgemm_fp8", "compressed_tensors",
|
"awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
|
||||||
"compressed-tensors", "experts_int8", "quark", "nvfp4", "bitblas",
|
"quark", "nvfp4", "bitblas", "gptq_bitblas"
|
||||||
"gptq_bitblas"
|
|
||||||
]
|
]
|
||||||
if self.quantization is not None:
|
if self.quantization is not None:
|
||||||
self.quantization = self.quantization.lower()
|
self.quantization = self.quantization.lower()
|
||||||
@ -764,6 +763,9 @@ class ModelConfig:
|
|||||||
|
|
||||||
if quant_cfg is not None:
|
if quant_cfg is not None:
|
||||||
quant_method = quant_cfg.get("quant_method", "").lower()
|
quant_method = quant_cfg.get("quant_method", "").lower()
|
||||||
|
quant_method = quant_method.replace("compressed_tensors",
|
||||||
|
"compressed-tensors")
|
||||||
|
quant_cfg["quant_method"] = quant_method
|
||||||
|
|
||||||
# Detect which checkpoint is it
|
# Detect which checkpoint is it
|
||||||
for name in QUANTIZATION_METHODS:
|
for name in QUANTIZATION_METHODS:
|
||||||
|
|||||||
@ -72,7 +72,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
return 70
|
return 70
|
||||||
|
|
||||||
def get_name(self) -> str:
|
def get_name(self) -> str:
|
||||||
return "compressed_tensors"
|
return "compressed-tensors"
|
||||||
|
|
||||||
def get_quant_method(
|
def get_quant_method(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -130,8 +130,8 @@ class RocmPlatform(Platform):
|
|||||||
device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
|
device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
|
||||||
|
|
||||||
supported_quantization: list[str] = [
|
supported_quantization: list[str] = [
|
||||||
"awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors",
|
"awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8", "gguf",
|
||||||
"fbgemm_fp8", "gguf", "quark", "ptpc_fp8"
|
"quark", "ptpc_fp8"
|
||||||
]
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@ -30,9 +30,7 @@ class TpuPlatform(Platform):
|
|||||||
ray_device_key: str = "TPU"
|
ray_device_key: str = "TPU"
|
||||||
device_control_env_var: str = "TPU_VISIBLE_CHIPS"
|
device_control_env_var: str = "TPU_VISIBLE_CHIPS"
|
||||||
|
|
||||||
supported_quantization: list[str] = [
|
supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"]
|
||||||
"tpu_int8", "compressed-tensors", "compressed_tensors"
|
|
||||||
]
|
|
||||||
|
|
||||||
additional_env_vars: list[str] = [
|
additional_env_vars: list[str] = [
|
||||||
"TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
|
"TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user