From 42c194964341bea9fc59e0d35db04dfafc3c473d Mon Sep 17 00:00:00 2001
From: Tsukasa OI
Date: Wed, 3 Dec 2025 19:33:46 +0900
Subject: [PATCH] [Bugfix][Quantization] Support BF16 tensors on GGUF (#29948)

Signed-off-by: Tsukasa OI
---
 tests/models/quantization/test_gguf.py           |  7 +++++++
 vllm/model_executor/model_loader/weight_utils.py | 12 +++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 3b9597507ac1..064ca94f3cba 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
     gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
 )
 
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
+
 PHI3_CONFIG = GGUFTestConfig(
     original_model="microsoft/Phi-3.5-mini-instruct",
     gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
 MODELS = [
     # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
     QWEN2_CONFIG,
+    QWEN3_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 0809bdfa9d4c..0496b7a84507 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -921,7 +921,17 @@ def gguf_quant_weights_iterator(
             name = gguf_to_hf_name_map[tensor.name]
             if weight_type.name not in ("F32", "BF16", "F16"):
                 name = name.replace("weight", "qweight")
-            param = torch.tensor(weight)
+            if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
+                # BF16 is currently the only "quantization" type that is not
+                # actually quantized but is read back as a raw byte tensor.
+                # Reinterpret it as a `torch.bfloat16` tensor.
+                weight = weight.view(np.uint16)
+                if reader.byte_order == "S":
+                    # GGUF endianness != system endianness
+                    weight = weight.byteswap()
+                param = torch.tensor(weight).view(torch.bfloat16)
+            else:
+                param = torch.tensor(weight)
             yield name, param
 
 
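Note (not part of the patch): the weight_utils.py hunk performs no value
conversion at all; it only relabels an existing byte buffer. A minimal
standalone sketch of that reinterpretation follows, assuming a PyTorch
build with torch.uint16 support (which the patched code also relies on);
the input bytes here are synthetic, fabricated from float32 values:

    import numpy as np
    import torch

    # Synthetic stand-in for what the GGUF reader returns for a BF16
    # tensor: a flat np.uint8 buffer. bfloat16 is the upper 16 bits of
    # the IEEE-754 float32 bit pattern, so we can fabricate one from f32.
    f32 = np.array([1.0, -2.5, 0.125, 3.0], dtype=np.float32)
    raw = (f32.view(np.uint32) >> 16).astype(np.uint16).view(np.uint8)

    # The reinterpretation the patch performs:
    weight = raw.view(np.uint16)  # pair adjacent bytes into 16-bit words
    # (on a byte-order mismatch, i.e. reader.byte_order == "S", the patch
    # additionally calls weight.byteswap() at this point)
    param = torch.tensor(weight).view(torch.bfloat16)

    print(param)
    # tensor([ 1.0000, -2.5000,  0.1250,  3.0000], dtype=torch.bfloat16)

Both .view() calls (NumPy and Torch) merely reinterpret the existing
buffer under a new dtype; the only data movement is the copy inside
torch.tensor(), which is why the byteswap for big-endian hosts has to
happen on the NumPy side, before the bytes are frozen into the tensor.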