mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-17 02:17:03 +08:00
[Bugfix] Fix prompt format of GLM4V (#14539)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
b1cc4dfef5
commit
f53a0586b9
@ -254,13 +254,21 @@ VLM_TEST_SETTINGS = {
|
||||
"glm4v": VLMTestInfo(
|
||||
models=["THUDM/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?", # noqa: E501
|
||||
}),
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
|
||||
patch_hf_runner=model_utils.glm_patch_hf_runner,
|
||||
patch_hf_runner=model_utils.glm4v_patch_hf_runner,
|
||||
# The image embeddings match with HF but the outputs of the language
|
||||
# decoder are only consistent up to 2 decimal places.
|
||||
# So, we need to reduce the number of tokens for the test to pass.
|
||||
max_tokens=8,
|
||||
num_logprobs=10,
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"h2ovl": VLMTestInfo(
|
||||
|
||||
@ -61,7 +61,9 @@ def run_test(
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
vllm_runner_kwargs_: dict[str, Any] = {}
|
||||
vllm_runner_kwargs_: dict[str, Any] = {
|
||||
"disable_mm_preprocessor_cache": True,
|
||||
}
|
||||
if model_info.tokenizer:
|
||||
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
|
||||
if model_info.tokenizer_mode:
|
||||
|
||||
@ -316,8 +316,8 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return hf_model
|
||||
|
||||
|
||||
def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4."""
|
||||
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
|
||||
hf_processor = hf_model.processor
|
||||
patch_padding_side(hf_processor)
|
||||
|
||||
@ -325,12 +325,20 @@ def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
if images is None:
|
||||
return hf_processor(*args, **kwargs)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
|
||||
contents = re.findall(
|
||||
r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
|
||||
text,
|
||||
)
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": images,
|
||||
"content": text
|
||||
}],
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
|
||||
@ -286,14 +286,18 @@ class ModelConfig:
|
||||
if rope_scaling is not None:
|
||||
hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
|
||||
hf_overrides_kw.update(hf_override)
|
||||
msg = ("`--rope-scaling` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_override!r}'`")
|
||||
hf_overrides_str = json.dumps(hf_overrides)
|
||||
msg = (
|
||||
"`--rope-scaling` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
|
||||
warnings.warn(DeprecationWarning(msg), stacklevel=2)
|
||||
if rope_theta is not None:
|
||||
hf_override = {"rope_theta": rope_theta}
|
||||
hf_overrides_kw.update(hf_override)
|
||||
msg = ("`--rope-theta` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_override!r}'`")
|
||||
hf_overrides_str = json.dumps(hf_overrides)
|
||||
msg = (
|
||||
"`--rope-theta` will be removed in a future release. "
|
||||
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
|
||||
warnings.warn(DeprecationWarning(msg), stacklevel=2)
|
||||
|
||||
self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
|
||||
|
||||
@ -403,7 +403,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
hf_config = self._model_config.hf_config
|
||||
model_type = hf_config.model_type
|
||||
|
||||
if modality in ["image", "image_embeds"]:
|
||||
if modality in ("image", "image_embeds"):
|
||||
if model_type == "chatglm":
|
||||
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
|
||||
if model_type == "phi3_v":
|
||||
# Workaround since this token is not defined in the tokenizer
|
||||
return f"<|image_{current_count}|>"
|
||||
@ -411,8 +413,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
|
||||
if model_type in ("minicpmo", "minicpmv"):
|
||||
return "(<image>./</image>)"
|
||||
if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
|
||||
"pixtral"):
|
||||
if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
|
||||
# These models do not use image tokens in the prompt
|
||||
return None
|
||||
if model_type == "qwen":
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# Adapted from
|
||||
# https://github.com/THUDM/ChatGLM2-6B
|
||||
"""Inference-only ChatGLM model compatible with THUDM weights."""
|
||||
import json
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
@ -463,7 +464,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
|
||||
"The configuration of this model indicates that it supports "
|
||||
"vision inputs, but you instantiated the text-only version "
|
||||
"of this model. Please use the vision model by setting "
|
||||
f"`--hf-overrides {hf_overrides!r}`")
|
||||
f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
|
||||
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
# Copyright (c) Alibaba Cloud.
|
||||
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
|
||||
"""Inference-only QWen model compatible with HuggingFace weights."""
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
@ -354,7 +354,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
|
||||
"The configuration of this model indicates that it supports "
|
||||
"vision inputs, but you instantiated the text-only version "
|
||||
"of this model. Please use the vision model by setting "
|
||||
f"`--hf-overrides {hf_overrides!r}`")
|
||||
f"`--hf-overrides '{json.dumps(hf_overrides)}'`")
|
||||
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user