From 6ec0d8dbe4ccff35d042fafa29f2c141e553e7ae Mon Sep 17 00:00:00 2001
From: danielafrimi <45691845+danielafrimi@users.noreply.github.com>
Date: Fri, 12 Dec 2025 21:27:47 +0200
Subject: [PATCH] [Fix] Load kv-cache dtype from hf_quant_config.json
 automatically (#29980)

Signed-off-by: Daniel Afrimi
---
 vllm/utils/torch_utils.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py
index c97efce312b56..edcb79fbc9cd7 100644
--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -194,10 +194,30 @@ def get_kv_cache_torch_dtype(
     return torch_dtype
 
 
+def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None:
+    quant_method = quant_cfg.get("quant_method", "")
+    if quant_method.startswith("modelopt"):
+        quantization_inner = quant_cfg.get("quantization", quant_cfg)
+        # Check if quant config is specified and use kv cache quant algo
+        kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get(
+            "kv_cache_quant_algo"
+        )
+        if isinstance(kv_algo, str):
+            return STR_DTYPE_TO_TORCH_DTYPE[kv_algo.lower()]
+    return None
+
+
 def kv_cache_dtype_str_to_dtype(
     kv_cache_dtype: str, model_config: ModelConfig
 ) -> torch.dtype:
+    # Model config may not be specified for unit tests, default to float16
+    dtype = model_config.dtype if model_config else torch.half
     if kv_cache_dtype == "auto":
-        # Model config may not be specified for unit tests, default to float16
-        return model_config.dtype if model_config else torch.half
+        hf_cfg = getattr(model_config, "hf_config", None)
+        if hf_cfg is not None:
+            quant_cfg = getattr(hf_cfg, "quantization_config", None)
+            if quant_cfg is not None:
+                kv_algo_dtype = get_kv_cache_quant_algo_dtype(quant_cfg)
+                return kv_algo_dtype if kv_algo_dtype is not None else dtype
+        return dtype
     return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype]