[Bugfix][Quantization] Support BF16 tensors on GGUF (#29948)

Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
Author: Tsukasa OI, 2025-12-03 19:33:46 +09:00 (committed by GitHub)
parent cc4e296ea6
commit 42c1949643
2 changed files with 18 additions and 1 deletion


@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
     gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
 )
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
 PHI3_CONFIG = GGUFTestConfig(
     original_model="microsoft/Phi-3.5-mini-instruct",
     gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
 MODELS = [
     # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
     QWEN2_CONFIG,
+    QWEN3_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,

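The new QWEN3_CONFIG entry points the GGUF test suite at an unquantized BF16 checkpoint, which is exactly the case the loader change below fixes. For orientation only, a minimal sketch (not part of this commit) of loading the same BF16 GGUF file through vLLM's public API, assuming the repo and filename from the config above are reachable:

# Sketch, not part of this commit: load the BF16 GGUF checkpoint referenced
# by QWEN3_CONFIG. Repo and filename come from the config above; the prompt
# is illustrative.
from huggingface_hub import hf_hub_download
from vllm import LLM

gguf_path = hf_hub_download(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-BF16.gguf",
)
# GGUF files do not ship the HF tokenizer, so point it at the original model.
llm = LLM(model=gguf_path, tokenizer="Qwen/Qwen3-0.6B")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)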

@@ -921,7 +921,17 @@ def gguf_quant_weights_iterator(
         name = gguf_to_hf_name_map[tensor.name]
         if weight_type.name not in ("F32", "BF16", "F16"):
             name = name.replace("weight", "qweight")
-        param = torch.tensor(weight)
+        if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
+            # BF16 is currently the only "quantization" type that isn't
+            # actually quantized but is read as a raw byte tensor.
+            # Reinterpret as `torch.bfloat16` tensor.
+            weight = weight.view(np.uint16)
+            if reader.byte_order == "S":
+                # GGUF endianness != system endianness
+                weight = weight.byteswap()
+            param = torch.tensor(weight).view(torch.bfloat16)
+        else:
+            param = torch.tensor(weight)
         yield name, param
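
The core of the fix: gguf hands BF16 tensor data back as a flat np.uint8 buffer, so the loader views it as 16-bit words, byte-swaps when the file's endianness differs from the host's, and reinterprets the result as torch.bfloat16. A standalone sketch of that reinterpretation, with a made-up two-element buffer and assuming a torch build with uint16 support (as vLLM's pinned torch has):

# Standalone sketch of the BF16 reinterpretation above; `raw_bytes` is a
# made-up example buffer, not something produced by vLLM or gguf.
import numpy as np
import torch

# Two bfloat16 values stored as little-endian raw bytes:
# 1.0 -> 0x3F80, -2.0 -> 0xC000.
raw_bytes = np.array([0x80, 0x3F, 0x00, 0xC0], dtype=np.uint8)

# View the byte buffer as 16-bit words, then reinterpret as torch.bfloat16.
words = raw_bytes.view(np.uint16)
# If the file's byte order differed from the host's, `words = words.byteswap()`
# would go here, mirroring the `reader.byte_order == "S"` branch above.
param = torch.tensor(words).view(torch.bfloat16)
print(param)  # -> tensor([ 1., -2.], dtype=torch.bfloat16)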