mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-11 14:56:16 +08:00
Signed-off-by: Injae Ryou <injaeryou@gmail.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
209 lines
7.3 KiB
Python
209 lines
7.3 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""GGUF utility functions."""
|
|
|
|
from pathlib import Path
|
|
|
|
import gguf
|
|
from gguf.constants import Keys, VisionProjectorType
|
|
from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
|
|
|
|
from vllm.logger import init_logger
|
|
from vllm.transformers_utils.config import list_filtered_repo_files
|
|
|
|
# Module-level logger shared by every helper in this file; created through
# vLLM's `init_logger` and keyed on this module's import name.
logger = init_logger(__name__)
|
|
|
|
|
|
def detect_gguf_multimodal(model: str) -> Path | None:
|
|
"""Check if GGUF model has multimodal projector file.
|
|
|
|
Args:
|
|
model: Model path string
|
|
|
|
Returns:
|
|
Path to mmproj file if found, None otherwise
|
|
"""
|
|
if not model.endswith(".gguf"):
|
|
return None
|
|
|
|
try:
|
|
model_path = Path(model)
|
|
if not model_path.is_file():
|
|
return None
|
|
|
|
model_dir = model_path.parent
|
|
mmproj_patterns = ["mmproj.gguf", "mmproj-*.gguf", "*mmproj*.gguf"]
|
|
for pattern in mmproj_patterns:
|
|
mmproj_files = list(model_dir.glob(pattern))
|
|
if mmproj_files:
|
|
return mmproj_files[0]
|
|
return None
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def extract_vision_config_from_gguf(
    mmproj_path: str | Path,
) -> "SiglipVisionConfig | None":
    """Build a SiglipVisionConfig from the metadata of an mmproj GGUF file.

    The projector type is read first (when present) so that projector-specific
    adjustments can be applied. The vision encoder hyper-parameters are then
    read from the standard CLIP vision keys defined by ``gguf.constants``.
    Parameters that are not read from the file keep the defaults of
    ``SiglipVisionConfig``.

    Args:
        mmproj_path: Path to the mmproj GGUF file (str or Path).

    Returns:
        A ``SiglipVisionConfig`` when every required field is present,
        ``None`` as soon as one required field is missing.

    Raises:
        Exception: Errors raised while opening or parsing the file (file not
            found, corrupted file, etc.) propagate from ``gguf.GGUFReader``.
    """
    reader = gguf.GGUFReader(str(mmproj_path))

    # Detect projector type to apply model-specific parameters.
    projector_type = None
    projector_type_field = reader.get_field(Keys.Clip.PROJECTOR_TYPE)
    if projector_type_field is not None:
        try:
            projector_type = bytes(projector_type_field.parts[-1]).decode("utf-8")
        except (AttributeError, UnicodeDecodeError) as e:
            # A broken projector-type string is not fatal: continue without
            # projector-specific adjustments.
            logger.warning("Failed to decode projector type from GGUF: %s", e)

    # Map GGUF field constants to SiglipVisionConfig parameters.
    # Format: {gguf_constant: (param_name, dtype)}
    VISION_CONFIG_FIELDS = {
        Keys.ClipVision.EMBEDDING_LENGTH: ("hidden_size", int),
        Keys.ClipVision.FEED_FORWARD_LENGTH: ("intermediate_size", int),
        Keys.ClipVision.BLOCK_COUNT: ("num_hidden_layers", int),
        Keys.ClipVision.Attention.HEAD_COUNT: ("num_attention_heads", int),
        Keys.ClipVision.IMAGE_SIZE: ("image_size", int),
        Keys.ClipVision.PATCH_SIZE: ("patch_size", int),
        Keys.ClipVision.Attention.LAYERNORM_EPS: ("layer_norm_eps", float),
    }

    # Extract and validate all required fields.
    config_params: dict[str, int | float | bool] = {}
    for gguf_key, (param_name, dtype) in VISION_CONFIG_FIELDS.items():
        field = reader.get_field(gguf_key)
        if field is None:
            logger.warning(
                "Missing required vision config field '%s' in mmproj.gguf",
                gguf_key,
            )
            return None
        # GGUFReader exposes a scalar value as a one-element NumPy array.
        # `.item()` unwraps it explicitly; calling int()/float() directly on
        # a 1-D array relies on a conversion NumPy has deprecated.
        config_params[param_name] = dtype(field.parts[-1].item())

    # Apply model-specific parameters based on projector type.
    if projector_type == VisionProjectorType.GEMMA3:
        # Gemma3 doesn't use the vision pooling head (multihead attention).
        # This is a vLLM-specific parameter used in SiglipVisionTransformer.
        config_params["vision_use_head"] = False
        logger.info("Detected Gemma3 projector, disabling vision pooling head")
    # Add other projector-type-specific customizations here as needed.

    # num_channels and attention_dropout are left at the SiglipVisionConfig
    # defaults.
    config = SiglipVisionConfig(**config_params)

    if projector_type:
        logger.info(
            "Extracted vision config from mmproj.gguf (projector_type: %s)",
            projector_type,
        )
    else:
        logger.info("Extracted vision config from mmproj.gguf metadata")

    return config
|
|
|
|
|
|
def maybe_patch_hf_config_from_gguf(
    model: str,
    hf_config: PretrainedConfig,
) -> PretrainedConfig:
    """Return a multimodal HF config when a GGUF model ships a projector.

    If an mmproj file is found next to ``model`` and a vision config can be
    extracted from it, and ``hf_config`` describes a Gemma3 model
    (``model_type`` of ``"gemma3"`` or ``"gemma3_text"``), a new
    ``Gemma3Config`` is assembled from the text config of ``hf_config`` and
    the extracted vision config, with its architecture set to
    ``Gemma3ForConditionalGeneration``. In every other case ``hf_config`` is
    returned as received.

    Args:
        model: Model path string.
        hf_config: HuggingFace config of the model.

    Returns:
        The newly built ``Gemma3Config`` for Gemma3 models with a usable
        projector, otherwise the original ``hf_config``.
    """
    projector_file = detect_gguf_multimodal(model)
    if projector_file is None:
        # No mmproj file next to the model: nothing to patch.
        return hf_config

    extracted_vision = extract_vision_config_from_gguf(str(projector_file))
    language_config = hf_config.get_text_config()
    gemma3_family = hf_config.model_type in ("gemma3", "gemma3_text")

    if extracted_vision is None or not gemma3_family:
        return hf_config

    # Combine the text and vision halves into a Gemma3 multimodal config.
    return Gemma3Config.from_text_vision_configs(
        text_config=language_config,
        vision_config=extracted_vision,
        architectures=["Gemma3ForConditionalGeneration"],
    )
|
|
|
|
|
|
def get_gguf_file_path_from_hf(
    repo_id: str | Path,
    quant_type: str,
    revision: str | None = None,
) -> str:
    """Find the GGUF file in a HuggingFace repo for a quantization type.

    Args:
        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B").
        quant_type: The quantization type (e.g., "Q4_K_M", "F16").
        revision: Optional revision/branch name.

    Returns:
        The repo-relative path of the selected GGUF file
        (e.g., "filename.gguf").

    Raises:
        ValueError: If no file in the repository matches ``quant_type``.
    """
    repo_id = str(repo_id)
    # Match the quant type as the last name component or followed by a
    # further "-..." suffix, at the repo root or one directory down.
    gguf_patterns = [
        f"*-{quant_type}.gguf",
        f"*-{quant_type}-*.gguf",
        f"*/*-{quant_type}.gguf",
        f"*/*-{quant_type}-*.gguf",
    ]
    matching_files = list_filtered_repo_files(
        repo_id,
        allow_patterns=gguf_patterns,
        revision=revision,
    )

    if not matching_files:
        # Exception constructors do not apply %-style arguments, so the
        # message has to be formatted before it is raised.
        raise ValueError(
            f"Could not find GGUF file for repo {repo_id} "
            f"with quantization {quant_type}."
        )

    # Pick deterministically: fewest hyphens first (extra suffixes such as a
    # shard index add hyphens, so non-sharded files are preferred), then
    # alphabetical order. `min` avoids mutating the list returned by the
    # helper.
    return min(matching_files, key=lambda name: (name.count("-"), name))
|