[Bugfix][Quantization] Support BF16 tensors on GGUF (#29948)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
parent: cc4e296ea6
commit: 42c1949643
@@ -47,6 +47,12 @@ QWEN2_CONFIG = GGUFTestConfig(
     gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
 )
 
+QWEN3_CONFIG = GGUFTestConfig(
+    original_model="Qwen/Qwen3-0.6B",
+    gguf_repo="unsloth/Qwen3-0.6B-GGUF",
+    gguf_filename="Qwen3-0.6B-BF16.gguf",
+)
+
 PHI3_CONFIG = GGUFTestConfig(
     original_model="microsoft/Phi-3.5-mini-instruct",
     gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
@@ -87,6 +93,7 @@ GEMMA3_CONFIG = GGUFTestConfig(
 MODELS = [
     # LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
     QWEN2_CONFIG,
+    QWEN3_CONFIG,
     PHI3_CONFIG,
     GPT2_CONFIG,
     STABLELM_CONFIG,
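For context: the new QWEN3_CONFIG entry exercises a checkpoint whose tensors are stored as raw BF16 rather than a quantized GGUF type. Below is a minimal sketch of how such a file is loaded through vLLM, in the spirit of what the test does; the local path and prompt are illustrative, not part of this commit.

    from vllm import LLM

    # GGUF checkpoints are loaded by file path; the tokenizer comes
    # from the original (non-GGUF) model repository.
    llm = LLM(
        model="./Qwen3-0.6B-BF16.gguf",  # hypothetical local download
        tokenizer="Qwen/Qwen3-0.6B",
    )
    print(llm.generate("Hello")[0].outputs[0].text)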
@@ -921,7 +921,17 @@ def gguf_quant_weights_iterator(
             name = gguf_to_hf_name_map[tensor.name]
             if weight_type.name not in ("F32", "BF16", "F16"):
                 name = name.replace("weight", "qweight")
-            param = torch.tensor(weight)
+            if weight_type.name == "BF16" and tensor.data.dtype == np.uint8:
+                # BF16 is currently the only "quantization" type that isn't
+                # actually quantized but is read as a raw byte tensor.
+                # Reinterpret as `torch.bfloat16` tensor.
+                weight = weight.view(np.uint16)
+                if reader.byte_order == "S":
+                    # GGUF endianness != system endianness
+                    weight = weight.byteswap()
+                param = torch.tensor(weight).view(torch.bfloat16)
+            else:
+                param = torch.tensor(weight)
             yield name, param
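The fix hinges on a pure bit-reinterpretation: numpy has no bfloat16 dtype, so the GGUF reader hands BF16 tensor data back as raw bytes (np.uint8). A self-contained sketch of the same steps as the hunk above, using illustrative sample bytes:

    import sys

    import numpy as np
    import torch

    # Two BF16 values, 1.0 (0x3F80) and -2.0 (0xC000), as little-endian bytes.
    raw = np.frombuffer(bytes([0x80, 0x3F, 0x00, 0xC0]), dtype=np.uint8)

    u16 = raw.view(np.uint16)    # pair adjacent bytes into one uint16 per value
    if sys.byteorder == "big":   # mimics the byte_order == "S" branch above:
        u16 = u16.byteswap()     # file byte order != system byte order
    param = torch.tensor(u16).view(torch.bfloat16)  # reinterpret bits; no numeric conversion
    print(param)                 # tensor([ 1., -2.], dtype=torch.bfloat16)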