mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-20 19:57:02 +08:00
[Quantization]: Support compressed-tensors mixed-precision model loading (#22468)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
This commit is contained in:
parent
279a5f31b3
commit
4121de512e
@ -192,7 +192,15 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
quant_config.get("weights"))
|
quant_config.get("weights"))
|
||||||
|
|
||||||
target_scheme_map[target]["input_activations"] = None
|
target_scheme_map[target]["input_activations"] = None
|
||||||
if is_activation_quantization_format(quant_format):
|
target_scheme_map[target]["format"] = quant_config.get(
|
||||||
|
"format")
|
||||||
|
format = target_scheme_map[target].get("format")
|
||||||
|
# If no per-config format defined, use global format in config
|
||||||
|
act_quant_format = is_activation_quantization_format(
|
||||||
|
format
|
||||||
|
) if format is not None else is_activation_quantization_format(
|
||||||
|
quant_format)
|
||||||
|
if act_quant_format:
|
||||||
input_activations = quant_config.get("input_activations")
|
input_activations = quant_config.get("input_activations")
|
||||||
# The only case where we have activation quant supported
|
# The only case where we have activation quant supported
|
||||||
# but no input_activations provided in the config
|
# but no input_activations provided in the config
|
||||||
@ -389,8 +397,10 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
return (is_channel_group and input_quant_none and is_static)
|
return (is_channel_group and input_quant_none and is_static)
|
||||||
|
|
||||||
def _get_scheme_from_parts(
|
def _get_scheme_from_parts(
|
||||||
self, weight_quant: BaseModel,
|
self,
|
||||||
input_quant: BaseModel) -> "CompressedTensorsScheme":
|
weight_quant: BaseModel,
|
||||||
|
input_quant: BaseModel,
|
||||||
|
format: Optional[str] = None) -> "CompressedTensorsScheme":
|
||||||
# Detect If Mixed Precision
|
# Detect If Mixed Precision
|
||||||
if self._is_fp4a16_nvfp4(weight_quant, input_quant):
|
if self._is_fp4a16_nvfp4(weight_quant, input_quant):
|
||||||
return CompressedTensorsW4A16Fp4()
|
return CompressedTensorsW4A16Fp4()
|
||||||
@ -412,7 +422,11 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
group_size=weight_quant.group_size,
|
group_size=weight_quant.group_size,
|
||||||
actorder=weight_quant.actorder)
|
actorder=weight_quant.actorder)
|
||||||
|
|
||||||
if is_activation_quantization_format(self.quant_format):
|
act_quant_format = is_activation_quantization_format(
|
||||||
|
format
|
||||||
|
) if format is not None else is_activation_quantization_format(
|
||||||
|
self.quant_format)
|
||||||
|
if act_quant_format:
|
||||||
if self._is_fp4a4_nvfp4(weight_quant, input_quant):
|
if self._is_fp4a4_nvfp4(weight_quant, input_quant):
|
||||||
if cutlass_fp4_supported(
|
if cutlass_fp4_supported(
|
||||||
) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
|
) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
|
||||||
@ -507,6 +521,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
scheme_dict = self.target_scheme_map[matched_target]
|
scheme_dict = self.target_scheme_map[matched_target]
|
||||||
weight_quant = scheme_dict.get("weights")
|
weight_quant = scheme_dict.get("weights")
|
||||||
input_quant = scheme_dict.get("input_activations")
|
input_quant = scheme_dict.get("input_activations")
|
||||||
|
format = scheme_dict.get("format")
|
||||||
|
|
||||||
# Find the sparsity scheme of the layer
|
# Find the sparsity scheme of the layer
|
||||||
# assume that fused layers inherit first component's sparsity scheme
|
# assume that fused layers inherit first component's sparsity scheme
|
||||||
@ -547,7 +562,7 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
scheme = self._get_scheme_from_parts( # type: ignore
|
scheme = self._get_scheme_from_parts( # type: ignore
|
||||||
weight_quant=weight_quant,
|
weight_quant=weight_quant,
|
||||||
input_quant=input_quant,
|
input_quant=input_quant,
|
||||||
)
|
format=format)
|
||||||
|
|
||||||
# Raise error if device does not support the scheme
|
# Raise error if device does not support the scheme
|
||||||
# (e.g. fp8 needs ada lovelace)
|
# (e.g. fp8 needs ada lovelace)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user