[Bugfix] Fix prompt format of GLM4V (#14539)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-03-13 19:37:17 +08:00 committed by GitHub
parent b1cc4dfef5
commit f53a0586b9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 43 additions and 19 deletions

View File

@ -254,13 +254,21 @@ VLM_TEST_SETTINGS = {
"glm4v": VLMTestInfo(
models=["THUDM/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt=lambda idx: "",
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
}),
max_model_len=2048,
max_num_seqs=2,
dtype="bfloat16",
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
patch_hf_runner=model_utils.glm_patch_hf_runner,
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
# The image embeddings match with HF but the outputs of the language
# decoder are only consistent up to 2 decimal places.
# So, we need to reduce the number of tokens for the test to pass.
max_tokens=8,
num_logprobs=10,
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo(

View File

@ -61,7 +61,9 @@ def run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_runner_kwargs_: dict[str, Any] = {}
vllm_runner_kwargs_: dict[str, Any] = {
"disable_mm_preprocessor_cache": True,
}
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode:

View File

@ -316,8 +316,8 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4."""
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
hf_processor = hf_model.processor
patch_padding_side(hf_processor)
@ -325,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
if images is None:
return hf_processor(*args, **kwargs)
images = [images] if isinstance(images, Image) else images
contents = re.findall(
r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
text,
)
assert len(contents) == len(images)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": images,
"content": text
}],
"image": image,
"content": content
} for image, content in zip(images, contents)],
add_generation_prompt=True,
tokenize=True,
return_dict=True,

View File

@ -286,14 +286,18 @@ class ModelConfig:
if rope_scaling is not None:
hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
hf_overrides_kw.update(hf_override)
msg = ("`--rope-scaling` will be removed in a future release. "
f"'Please instead use `--hf-overrides '{hf_override!r}'`")
hf_overrides_str = json.dumps(hf_overrides)
msg = (
"`--rope-scaling` will be removed in a future release. "
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2)
if rope_theta is not None:
hf_override = {"rope_theta": rope_theta}
hf_overrides_kw.update(hf_override)
msg = ("`--rope-theta` will be removed in a future release. "
f"'Please instead use `--hf-overrides '{hf_override!r}'`")
hf_overrides_str = json.dumps(hf_overrides)
msg = (
"`--rope-theta` will be removed in a future release. "
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2)
self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)

View File

@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
hf_config = self._model_config.hf_config
model_type = hf_config.model_type
if modality in ["image", "image_embeds"]:
if modality in ("image", "image_embeds"):
if model_type == "chatglm":
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer
return f"<|image_{current_count}|>"
@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"):
return "(<image>./</image>)"
if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
"pixtral"):
if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
# These models do not use image tokens in the prompt
return None
if model_type == "qwen":

View File

@ -2,6 +2,7 @@
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
import json
from typing import Iterable, Optional, Set, Tuple, Union
import torch
@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f"`--hf-overrides {hf_overrides!r}`")
f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
super().__init__(vllm_config=vllm_config, prefix=prefix)

View File

@ -5,7 +5,7 @@
# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights."""
import json
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
import torch
@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f"`--hf-overrides {hf_overrides!r}`")
f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
super().__init__(vllm_config=vllm_config, prefix=prefix)