From 42c194964341bea9fc59e0d35db04dfafc3c473d Mon Sep 17 00:00:00 2001
From: Tsukasa OI
Date: Wed, 3 Dec 2025 19:33:46 +0900
Subject: [PATCH] [Bugfix][Quantization] Support BF16 tensors on GGUF (#29948)

Signed-off-by: Tsukasa OI
---
 tests/models/quantization/test_gguf.py           |  7 +++++++
 vllm/model_executor/model_loader/weight_utils.py | 12 +++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 3b9597507ac1..064ca94f3cba 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
     gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
 )
 
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
+
 PHI3_CONFIG = GGUFTestConfig(
     original_model="microsoft/Phi-3.5-mini-instruct",
     gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
 MODELS = [
     # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
     QWEN2_CONFIG,
+    QWEN3_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 0809bdfa9d4c..0496b7a84507 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -921,7 +921,17 @@ def gguf_quant_weights_iterator(
             name = gguf_to_hf_name_map[tensor.name]
             if weight_type.name not in ("F32", "BF16", "F16"):
                 name = name.replace("weight", "qweight")
-            param = torch.tensor(weight)
+            if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
+                # BF16 is currently the only "quantization" type that is not
+                # actually quantized but is read back as a raw byte tensor.
+                # Reinterpret it as a `torch.bfloat16` tensor.
+                weight = weight.view(np.uint16)
+                if reader.byte_order == "S":
+                    # GGUF endianness != system endianness
+                    weight = weight.byteswap()
+                param = torch.tensor(weight).view(torch.bfloat16)
+            else:
+                param = torch.tensor(weight)
             yield name, param
 
 
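Note (not part of the patch): the weight_utils.py hunk performs no value
conversion at all; it only relabels an existing byte buffer. A minimal
standalone sketch of that reinterpretation follows, assuming a PyTorch
build with torch.uint16 support (which the patched code also relies on);
the input bytes here are synthetic, fabricated from float32 values:

    import numpy as np
    import torch

    # Synthetic stand-in for what the GGUF reader returns for a BF16
    # tensor: a flat np.uint8 buffer. bfloat16 is the upper 16 bits of
    # the IEEE-754 float32 bit pattern, so we can fabricate one from f32.
    f32 = np.array([1.0, -2.5, 0.125, 3.0], dtype=np.float32)
    raw = (f32.view(np.uint32) >> 16).astype(np.uint16).view(np.uint8)

    # The reinterpretation the patch performs:
    weight = raw.view(np.uint16)  # pair adjacent bytes into 16-bit words
    # (on a byte-order mismatch, i.e. reader.byte_order == "S", the patch
    # additionally calls weight.byteswap() at this point)
    param = torch.tensor(weight).view(torch.bfloat16)

    print(param)
    # tensor([ 1.0000, -2.5000,  0.1250,  3.0000], dtype=torch.bfloat16)

Both .view() calls (NumPy and Torch) merely reinterpret the existing
buffer under a new dtype; the only data movement is the copy inside
torch.tensor(), which is why the byteswap for big-endian hosts has to
happen on the NumPy side, before the bytes are frozen into the tensor.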