From dc13c99eedf837f22f60e9b5836abf147f5254f1 Mon Sep 17 00:00:00 2001 From: Christina Norman Date: Fri, 12 Dec 2025 09:10:12 -0600 Subject: [PATCH] fix(gguf): Disable bfloat16 for GGUF on blackwell device (#30408) Signed-off-by: Christina Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Christina Norman Co-authored-by: Isotr0py Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/gguf.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 13aa2bcad21ba..9dd734f2fea6a 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -33,6 +33,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op logger = init_logger(__name__) @@ -52,6 +53,11 @@ class GGUFConfig(QuantizationConfig): return "gguf" def get_supported_act_dtypes(self) -> list[torch.dtype]: + # GGUF dequantization kernels use half precision (fp16) internally. + # bfloat16 has precision issues on Blackwell devices. + if current_platform.has_device_capability(100): + logger.warning_once("GGUF has precision issues with bfloat16 on Blackwell.") + return [torch.half, torch.float32] return [torch.half, torch.bfloat16, torch.float32] @classmethod