From 4121de512ea44bd2ad4fae22c96c437598be6b62 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 14 Aug 2025 17:32:09 -0400
Subject: [PATCH] [Quantization]: Support compressed-tensors mixed-precision
 model loading (#22468)

Signed-off-by: Dipika Sikka
---
 .../compressed_tensors/compressed_tensors.py | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 69bced7c0b8ec..637a84372990a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -192,7 +192,15 @@ class CompressedTensorsConfig(QuantizationConfig):
                         quant_config.get("weights"))
 
                 target_scheme_map[target]["input_activations"] = None
-                if is_activation_quantization_format(quant_format):
+                target_scheme_map[target]["format"] = quant_config.get(
+                    "format")
+                format = target_scheme_map[target].get("format")
+                # If no per-config format defined, use global format in config
+                act_quant_format = is_activation_quantization_format(
+                    format
+                ) if format is not None else is_activation_quantization_format(
+                    quant_format)
+                if act_quant_format:
                     input_activations = quant_config.get("input_activations")
                     # The only case where we have activation quant supported
                     # but no input_activations provided in the config
@@ -389,8 +397,10 @@ class CompressedTensorsConfig(QuantizationConfig):
         return (is_channel_group and input_quant_none and is_static)
 
     def _get_scheme_from_parts(
-            self, weight_quant: BaseModel,
-            input_quant: BaseModel) -> "CompressedTensorsScheme":
+            self,
+            weight_quant: BaseModel,
+            input_quant: BaseModel,
+            format: Optional[str] = None) -> "CompressedTensorsScheme":
         # Detect If Mixed Precision
         if self._is_fp4a16_nvfp4(weight_quant, input_quant):
             return CompressedTensorsW4A16Fp4()
@@ -412,7 +422,11 @@ class CompressedTensorsConfig(QuantizationConfig):
                 group_size=weight_quant.group_size,
                 actorder=weight_quant.actorder)
 
-        if is_activation_quantization_format(self.quant_format):
+        act_quant_format = is_activation_quantization_format(
+            format
+        ) if format is not None else is_activation_quantization_format(
+            self.quant_format)
+        if act_quant_format:
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
                 if cutlass_fp4_supported(
                 ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
@@ -507,6 +521,7 @@ class CompressedTensorsConfig(QuantizationConfig):
             scheme_dict = self.target_scheme_map[matched_target]
             weight_quant = scheme_dict.get("weights")
             input_quant = scheme_dict.get("input_activations")
+            format = scheme_dict.get("format")
 
         # Find the sparsity scheme of the layer
         # assume that fused layers inerhit first component's sparsity scheme
@@ -547,7 +562,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         scheme = self._get_scheme_from_parts(  # type: ignore
             weight_quant=weight_quant,
             input_quant=input_quant,
-        )
+            format=format)
 
         # Raise error if device does not support the scheme
         # (e.g. fp8 needs ada lovelace)
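
A minimal, self-contained sketch of the fallback this patch introduces: a
config group's own "format", when defined, takes precedence over the global
quant_format when deciding whether a scheme uses activation quantization,
which is what enables per-target mixed precision. The format names and the
stub helper below are illustrative assumptions, not vllm's actual
implementation:

    from typing import Optional

    # Assumed stand-in for vllm's is_activation_quantization_format helper;
    # the real set of recognized formats lives in the vllm/compressed-tensors
    # sources and may differ.
    ACT_QUANT_FORMATS = {"float-quantized", "int-quantized", "naive-quantized"}

    def is_activation_quantization_format(fmt: str) -> bool:
        return fmt in ACT_QUANT_FORMATS

    def uses_act_quant(format: Optional[str], quant_format: str) -> bool:
        # Mirrors the patched logic: the per-target format, when present,
        # wins over the global quant_format.
        return (is_activation_quantization_format(format)
                if format is not None
                else is_activation_quantization_format(quant_format))

    # E.g. a weight-only group ("pack-quantized", assumed name) inside a
    # model whose global format is "float-quantized" is no longer treated
    # as activation-quantized, while groups without their own format still
    # fall back to the global setting.
    assert uses_act_quant("pack-quantized", "float-quantized") is False
    assert uses_act_quant(None, "float-quantized") is True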