mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Quantization]: Support compressed-tensors mixed-precision model loading (#22468)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
parent 279a5f31b3
commit 4121de512e
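What the change does: each config group in a compressed-tensors quantization config may now carry its own "format" entry. During config parsing the loader records that format per target, uses it (falling back to the global quant format when no per-group format is set) to decide whether the scheme is an activation-quantization format, and passes it through to _get_scheme_from_parts so that mixed-precision checkpoints map each target to the right scheme.

A rough sketch of the kind of config this enables, written as a Python dict. The group names, target patterns, quantization-arg fields, and format strings below are illustrative assumptions, not values taken from this commit:

    quantization_config = {
        "quant_method": "compressed-tensors",
        # Global format; used for any group that does not set its own.
        "format": "nvfp4-pack-quantized",
        "config_groups": {
            "group_0": {
                "targets": ["re:.*self_attn.*"],
                "weights": {"num_bits": 4, "type": "float", "group_size": 16},
                "input_activations": {"num_bits": 4, "type": "float"},
                # No per-group "format": the global format above applies.
            },
            "group_1": {
                "targets": ["re:.*mlp.*"],
                "weights": {"num_bits": 8, "type": "float"},
                "input_activations": {"num_bits": 8, "type": "float", "dynamic": True},
                "format": "float-quantized",  # per-group override (new in this change)
            },
        },
    }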
@@ -192,7 +192,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                         quant_config.get("weights"))
 
                 target_scheme_map[target]["input_activations"] = None
-                if is_activation_quantization_format(quant_format):
+                target_scheme_map[target]["format"] = quant_config.get(
+                    "format")
+                format = target_scheme_map[target].get("format")
+                # If no per-config format defined, use global format in config
+                act_quant_format = is_activation_quantization_format(
+                    format
+                ) if format is not None else is_activation_quantization_format(
+                    quant_format)
+                if act_quant_format:
                     input_activations = quant_config.get("input_activations")
                     # The only case where we have activation quant supported
                     # but no input_activations provided in the config
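The core of this hunk is the fallback rule: a per-target "format", when present, decides whether activation quantization applies; only when it is absent does the global quant_format decide. A minimal standalone sketch of that selection logic; the helper and the set of format strings are simplified stand-ins, not vLLM's actual is_activation_quantization_format:

    from typing import Optional

    # Simplified stand-in for the activation-quantization formats.
    _ACT_QUANT_FORMATS = {"int-quantized", "float-quantized", "nvfp4-pack-quantized"}

    def is_activation_quantization_format(fmt: str) -> bool:
        return fmt in _ACT_QUANT_FORMATS

    def uses_activation_quant(global_format: str,
                              per_target_format: Optional[str]) -> bool:
        # Same shape as the diff: prefer the per-target format, and fall back
        # to the global format only when no per-target format was defined.
        return (is_activation_quantization_format(per_target_format)
                if per_target_format is not None
                else is_activation_quantization_format(global_format))

    # The per-target format wins even when it disagrees with the global one.
    assert uses_activation_quant("pack-quantized", "float-quantized") is True
    assert uses_activation_quant("float-quantized", "pack-quantized") is False
    assert uses_activation_quant("float-quantized", None) is True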
@@ -389,8 +397,10 @@ class CompressedTensorsConfig(QuantizationConfig):
         return (is_channel_group and input_quant_none and is_static)
 
     def _get_scheme_from_parts(
-            self, weight_quant: BaseModel,
-            input_quant: BaseModel) -> "CompressedTensorsScheme":
+            self,
+            weight_quant: BaseModel,
+            input_quant: BaseModel,
+            format: Optional[str] = None) -> "CompressedTensorsScheme":
         # Detect If Mixed Precision
         if self._is_fp4a16_nvfp4(weight_quant, input_quant):
             return CompressedTensorsW4A16Fp4()
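Because the new format parameter defaults to None, call sites that are not updated keep working unchanged; only get_scheme (further below) passes the per-target format explicitly. A tiny sketch of that compatibility property; scheme_from_parts here is a hypothetical stand-in for the method, shown only to illustrate the default:

    from typing import Optional

    def scheme_from_parts(weight_quant, input_quant,
                          format: Optional[str] = None):
        # When format is None, the real method falls back to the config-wide
        # self.quant_format (see the next hunk).
        return (weight_quant, input_quant, format)

    scheme_from_parts("w4-args", "a16-args")  # old call shape, format stays None
    scheme_from_parts("w4-args", "a4-args", format="nvfp4-pack-quantized")  # new call shape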
@@ -412,7 +422,11 @@ class CompressedTensorsConfig(QuantizationConfig):
                 group_size=weight_quant.group_size,
                 actorder=weight_quant.actorder)
 
-        if is_activation_quantization_format(self.quant_format):
+        act_quant_format = is_activation_quantization_format(
+            format
+        ) if format is not None else is_activation_quantization_format(
+            self.quant_format)
+        if act_quant_format:
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
                 if cutlass_fp4_supported(
                 ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
@@ -507,6 +521,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         scheme_dict = self.target_scheme_map[matched_target]
         weight_quant = scheme_dict.get("weights")
         input_quant = scheme_dict.get("input_activations")
+        format = scheme_dict.get("format")
 
         # Find the sparsity scheme of the layer
         # assume that fused layers inerhit first component's sparsity scheme
@@ -547,7 +562,7 @@ class CompressedTensorsConfig(QuantizationConfig):
             scheme = self._get_scheme_from_parts(  # type: ignore
                 weight_quant=weight_quant,
                 input_quant=input_quant,
-            )
+                format=format)
 
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
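End to end, the per-target format recorded in target_scheme_map during config parsing is read back out of scheme_dict in get_scheme and forwarded to _get_scheme_from_parts. A compressed sketch of that flow; the scheme_dict values are illustrative placeholders, not objects from this commit:

    # Hypothetical per-target entry as built during config parsing: weight and
    # input-activation args plus the (possibly None) per-target format.
    scheme_dict = {
        "weights": "<QuantizationArgs for weights>",
        "input_activations": "<QuantizationArgs or None>",
        "format": "float-quantized",  # None when the config group sets no format
    }

    weight_quant = scheme_dict.get("weights")
    input_quant = scheme_dict.get("input_activations")
    format = scheme_dict.get("format")

    # Mirrors the updated call in get_scheme: the per-target format now travels
    # with the weight/activation args instead of being inferred from the global
    # quant format.
    # scheme = self._get_scheme_from_parts(weight_quant=weight_quant,
    #                                      input_quant=input_quant,
    #                                      format=format)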