[Bugfix] Clean up and fix multi-modal processors (#13012)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-02-10 18:45:21 +08:00 committed by GitHub
parent fde71262e0
commit 51f0b5f7f6
7 changed files with 124 additions and 154 deletions

View File

@ -297,7 +297,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardware
* ✅
* ✅
* ?
* [](gh-issue:7968>)
* [](gh-issue:7968)
* ?
* ✅
*

View File

@ -26,6 +26,9 @@ from ...utils import check_logprobs_close
"google/gemma-1.1-2b-it", # gemma
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
"THUDM/chatglm3-6b", # ChatGLM (text-only)
),
pytest.param(
"meta-llama/Llama-3.2-1B-Instruct", # llama
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
@ -43,6 +46,9 @@ from ...utils import check_logprobs_close
"microsoft/phi-2", # phi
marks=[pytest.mark.core_model],
),
pytest.param(
"Qwen/Qwen-7B", # qwen (text-only)
),
pytest.param(
"Qwen/Qwen2.5-0.5B-Instruct", # qwen2
marks=[pytest.mark.core_model],
@ -68,6 +74,10 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
if model.startswith("THUDM/chatglm3"):
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
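For context, ChatGLM3's remote HF code does not expose get_output_embeddings(), which the HF runner's logprob helper presumably needs to project hidden states to vocabulary logits; hence the monkey-patch above. A minimal, self-contained sketch of the same trick (all names here are stand-ins, not vLLM's actual test helpers):

import torch
import torch.nn as nn

class TinyLM(nn.Module):
    """Stand-in for a model whose LM head is not exposed via
    get_output_embeddings(), mirroring the ChatGLM3 situation."""

    def __init__(self, hidden: int = 8, vocab: int = 16):
        super().__init__()
        self.output_layer = nn.Linear(hidden, vocab, bias=False)

def hidden_states_to_logprobs(model: nn.Module, hidden: torch.Tensor) -> torch.Tensor:
    # The helper expects get_output_embeddings() to return the LM head.
    lm_head = model.get_output_embeddings()
    return torch.log_softmax(lm_head(hidden), dim=-1)

model = TinyLM()
# Same trick as in the test: point get_output_embeddings at the real output layer.
model.get_output_embeddings = lambda: model.output_layer
print(hidden_states_to_logprobs(model, torch.randn(1, 8)).shape)  # torch.Size([1, 16])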

View File

@ -89,7 +89,7 @@ def _test_processing_correctness(
mm_data = {
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit))]
for _ in range(rng.randint(limit + 1))]
for k, limit in limit_mm_per_prompt.items()
}
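The one-character change above fixes an off-by-one: numpy's randint has an exclusive upper bound, so randint(limit) could never draw limit itself and the test never exercised a prompt that actually used the configured number of items per modality. A quick sanity check:

import numpy as np

rng = np.random.RandomState(0)
limit = 3
before = {rng.randint(limit) for _ in range(10_000)}      # {0, 1, 2}
after = {rng.randint(limit + 1) for _ in range(10_000)}   # {0, 1, 2, 3}
assert limit not in before and limit in after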

View File

@ -17,10 +17,7 @@ def random_video(
min_wh: int,
max_wh: int,
):
# Temporary workaround for https://github.com/huggingface/transformers/issues/35412
num_frames = rng.randint(min_frames, max_frames)
num_frames = (num_frames // 2) * 2
w, h = rng.randint(min_wh, max_wh, size=(2, ))
return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)
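With the even-frame workaround for the upstream transformers issue removed, the helper reduces to the lines kept above. A consolidated sketch of how it reads after this change (the leading parameters are assumed from the body; rng is taken to be a numpy RandomState):

import numpy as np

def random_video(
    rng: np.random.RandomState,
    min_frames: int,
    max_frames: int,
    min_wh: int,
    max_wh: int,
) -> np.ndarray:
    # Frame counts no longer need to be rounded down to an even number.
    num_frames = rng.randint(min_frames, max_frames)
    w, h = rng.randint(min_wh, max_wh, size=(2, ))
    return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8)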

View File

@ -4,8 +4,8 @@
# https://github.com/THUDM/CogAgent
"""Inference-only CogAgent model compatible with THUDM weights."""
from argparse import Namespace
from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple,
TypedDict, Union)
from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
Union)
import torch
from torch import nn
@ -19,7 +19,6 @@ from transformers.tokenization_utils_base import TextInput
from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@ -37,12 +36,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, BatchFeature,
BoundPromptReplacement,
MultiModalFieldConfig,
PlaceholderFeaturesInfo,
PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
@ -53,39 +50,6 @@ from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix, merge_multimodal_embeddings)
logger = init_logger(__name__)
IMAGE_TOKEN_ID = 151329
def build_normalization_transform(image_size: int) -> transforms.Compose:
"""
Build a normalization transform which can be applied to one or
more input images from which we want to extract visual features.
Args:
image_size: size of the image to be processed for visual embeddings.
Returns:
Callable transform for normalizing and resizing one RGB image.
"""
return transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
(0.48145466, 0.4578275, 0.40821073),
(0.26862954, 0.26130258, 0.27577711),
),
])
def calculate_image_placeholder(vision_config):
return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2
class GLMImagePixelInputs(TypedDict):
pixel_values: torch.Tensor
@ -109,9 +73,20 @@ class GLM4VProcessor:
self.config = config
self.tokenizer = tokenizer
if hasattr(self.config, "vision_config"):
self.image_transform = build_normalization_transform(
config.vision_config["image_size"])
if vision_config := getattr(config, "vision_config", None):
image_size = vision_config["image_size"]
self.image_transform = transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
])
else:
self.image_transform = None
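The old build_normalization_transform helper is inlined here with keyword arguments for the CLIP-style normalization constants, guarded by a walrus check so text-only ChatGLM configs simply get image_transform = None. A short usage sketch (requires torchvision and Pillow; the 224 size is illustrative, the real value comes from config.vision_config["image_size"]):

from PIL import Image
from torchvision import transforms
from torchvision.transforms import InterpolationMode

image_size = 224  # illustrative only
image_transform = transforms.Compose([
    transforms.Resize((image_size, image_size),
                      interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    # CLIP-style normalization constants, as in the diff above
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                         std=(0.26862954, 0.26130258, 0.27577711)),
])

pixel_values = image_transform(Image.new("RGB", (640, 480)))
print(pixel_values.shape)  # torch.Size([3, 224, 224])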
@ -150,9 +125,19 @@ class GLM4VProcessor:
class GLM4VProcessingInfo(BaseProcessingInfo):
def __init__(self, ctx):
super().__init__(ctx)
self._pre_calculate()
def get_tokenizer(self):
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
return tokenizer
def get_hf_config(self):
return self.ctx.get_hf_config(ChatGLMConfig)
def get_hf_processor(self) -> GLM4VProcessor:
return GLM4VProcessor(
self.get_hf_config(),
self.get_tokenizer(),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}
@ -162,27 +147,21 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.image_token_num + 2}
def _pre_calculate(self):
hf_config = self.get_hf_config()
vision_config = hf_config.vision_config
self.image_token_num = calculate_image_placeholder(vision_config)
self.image_size = vision_config["image_size"]
return {"image": self.get_num_image_feature_tokens()}
def get_num_image_tokens(self) -> int:
return self.image_token_num + 2
hf_config = self.get_hf_config()
if not (vision_config := getattr(hf_config, "vision_config", None)):
return 0
def get_image_size(self) -> ImageSize:
image_size = vision_config["image_size"]
patch_size = vision_config["patch_size"]
grid_length = image_size // patch_size // 2
return grid_length * grid_length
return ImageSize(height=self.image_size, width=self.image_size)
def get_hf_processor(self) -> GLM4VProcessor:
return GLM4VProcessor(
self.get_hf_config(),
self.get_tokenizer(),
)
def get_num_image_feature_tokens(self) -> int:
# EVA2CLIPModel has embeddings for boi and eoi tokens as well
return self.get_num_image_tokens() + 2
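The token counts are now derived from the vision config on demand instead of the removed _pre_calculate cache. A small sketch of the arithmetic (the 1120/14 values are purely illustrative; the real ones come from hf_config.vision_config):

def num_image_tokens(image_size: int, patch_size: int) -> int:
    # Mirrors get_num_image_tokens in the diff above.
    grid_length = image_size // patch_size // 2
    return grid_length * grid_length

def num_image_feature_tokens(image_size: int, patch_size: int) -> int:
    # EVA2CLIPModel also emits embeddings for the boi and eoi tokens.
    return num_image_tokens(image_size, patch_size) + 2

print(num_image_tokens(1120, 14))          # 1600
print(num_image_feature_tokens(1120, 14))  # 1602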
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
@ -192,8 +171,12 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
hf_config = self.info.get_hf_config()
if not (vision_config := getattr(hf_config, "vision_config", None)):
return ProcessorInputs(prompt_text="", mm_data={})
target_width = target_height = vision_config["image_size"]
num_images = mm_counts.get("image", 0)
target_width, target_height = self.info.get_image_size()
mm_data = {
"image":
@ -201,9 +184,11 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
height=target_height,
num_images=num_images)
}
text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
return ProcessorInputs(
prompt_text=text,
prompt_text=base_text * num_images,
mm_data=mm_data,
)
@ -223,47 +208,28 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
hf_config = self.info.get_hf_config()
if not hasattr(hf_config, "vision_config"):
return []
boi_token_id = hf_config.boi_token_id
image_token_id = hf_config.pad_token_id
eoi_token_id = hf_config.eoi_token_id
def get_replacement(item_idx: int):
image_tokens = self.info.image_token_num
return [IMAGE_TOKEN_ID] * image_tokens
num_image_tokens = self.info.get_num_image_tokens()
image_tokens = [image_token_id] * num_image_tokens
return [boi_token_id] + image_tokens + [eoi_token_id]
return [
PromptReplacement(
modality="image",
target=[IMAGE_TOKEN_ID],
target=[boi_token_id, image_token_id, eoi_token_id],
replacement=get_replacement,
),
]
def _apply_prompt_replacements(
self,
token_ids: list[int],
mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]],
mm_item_counts: Mapping[str, int],
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
token_ids, text, placeholders = super()._apply_prompt_replacements(
token_ids=token_ids,
mm_prompt_repls=mm_prompt_repls,
mm_item_counts=mm_item_counts,
)
hf_config = self.info.get_hf_config()
boi_token_id = hf_config.boi_token_id
eoi_token_id = hf_config.eoi_token_id
placeholders = {
modality: [
PlaceholderFeaturesInfo(
modality=p.modality,
item_idx=p.item_idx,
start_idx=p.start_idx - 1,
tokens=[boi_token_id] + p.tokens + [eoi_token_id],
) for p in ps
]
for modality, ps in placeholders.items()
}
return token_ids, text, placeholders
class GLMAttention(nn.Module):
@ -618,7 +584,7 @@ class ChatGLMModel(nn.Module):
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=[
self.config.boi_token_id,
IMAGE_TOKEN_ID,
self.config.pad_token_id,
self.config.eoi_token_id,
],
)
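Taken together, the changes to this file drop the hard-coded IMAGE_TOKEN_ID in favour of the boi/pad/eoi token ids from the HF config, and the PromptReplacement now targets the [boi, pad, eoi] triple and expands it in place, which is why the _apply_prompt_replacements override that shifted placeholder offsets could be deleted. A toy, standalone sketch of the expansion (token ids and the token count below are made up; vLLM's real machinery lives in PromptReplacement):

from typing import List

BOI, PAD, EOI = 901, 902, 903  # hypothetical ids for boi/pad/eoi
NUM_IMAGE_TOKENS = 4           # illustrative; really grid_length ** 2

def expand_image_placeholders(token_ids: List[int]) -> List[int]:
    # Find each [boi, pad, eoi] triple and expand the pad into the full run
    # of image feature tokens, as the new PromptReplacement does.
    out: List[int] = []
    i = 0
    while i < len(token_ids):
        if token_ids[i:i + 3] == [BOI, PAD, EOI]:
            out += [BOI] + [PAD] * NUM_IMAGE_TOKENS + [EOI]
            i += 3
        else:
            out.append(token_ids[i])
            i += 1
    return out

print(expand_image_placeholders([1, 2, BOI, PAD, EOI, 3]))
# [1, 2, 901, 902, 902, 902, 902, 903, 3]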

View File

@ -63,18 +63,6 @@ from .utils import (flatten_bn, is_pp_missing_parameter,
logger = init_logger(__name__)
# NOTE: Qwen models have a few other special tags, e.g., ref, bbox, quad;
# for the time being, these tags are not considered as special at encoding
# time. This may change as VLLMs multimodal API changes in the future.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_PAD = "<imgpad>"
# Image context is fixed at 256 for all images
MAX_QWEN_IMG_TOKENS = 256
# Image normalization params
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
class QwenImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
@ -622,25 +610,6 @@ class QWenModel(nn.Module):
return hidden_states
def build_normalization_transform(image_size: int) -> transforms.Compose:
"""
Build a normalization transform which can be applied to one or
more input images from which we want to extract visual features.
Args:
image_size: size of the image to be processed for visual embeddings.
Returns:
Callable transform for normalizing and resizing one RGB image.
"""
return transforms.Compose([
transforms.Resize((image_size, image_size),
interpolation=InterpolationMode.BICUBIC),
transforms.ToTensor(),
transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])
@lru_cache(maxsize=1)
def _get_tokenizer_without_image_pad(
tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
@ -716,16 +685,34 @@ class QWenVLProcessor:
self.config = config
self.tokenizer = tokenizer
if hasattr(self.config, "visual"):
self.image_transform = build_normalization_transform(
config.visual["image_size"])
if vision_config := getattr(self.config, "visual", None):
image_size = vision_config["image_size"]
self.image_transform = transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
])
else:
self.image_transform = None
special_tokens: dict[str,
int] = tokenizer.special_tokens # type: ignore
self.img_start_id = special_tokens[IMG_START]
self.img_end_id = special_tokens[IMG_END]
@property
def image_start_tag(self) -> str:
return self.tokenizer.image_start_tag # type: ignore
@property
def image_end_tag(self) -> str:
return self.tokenizer.image_end_tag # type: ignore
@property
def image_pad_tag(self) -> str:
return self.tokenizer.image_pad_tag # type: ignore
def __call__(
self,
@ -787,7 +774,14 @@ class QWenVLProcessingInfo(BaseProcessingInfo):
return {"image": self.get_num_image_tokens()}
def get_num_image_tokens(self) -> int:
return MAX_QWEN_IMG_TOKENS
hf_config = self.get_hf_config()
if not (vision_config := getattr(hf_config, "visual", None)):
return 0
image_size = vision_config["image_size"]
patch_size = vision_config["patch_size"]
grid_length = image_size // patch_size // 2
return grid_length * grid_length
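Sanity check for the new formula: assuming the visual config values Qwen-VL typically ships (image_size 448, patch_size 14), 448 // 14 // 2 = 16 and 16 * 16 = 256, which reproduces the removed MAX_QWEN_IMG_TOKENS constant, while text-only Qwen configs now correctly report 0 image tokens.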
class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):
@ -798,10 +792,12 @@ class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
hf_config = self.info.get_hf_config()
if not hasattr(hf_config, "visual"):
if not (vision_config := getattr(hf_config, "visual", None)):
return ProcessorInputs(prompt_text="", mm_data={})
vision_config = hf_config.visual
processor = self.info.get_hf_processor()
img_start = processor.image_start_tag
img_end = processor.image_end_tag
target_width = target_height = vision_config["image_size"]
num_images = mm_counts.get("image", 0)
@ -814,7 +810,7 @@ class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):
}
return ProcessorInputs(
prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n"
prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n"
for i in range(1, num_images + 1)),
mm_data=mm_data,
)
@ -869,13 +865,18 @@ class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]):
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
hf_config = self.info.get_hf_config()
if not hasattr(hf_config, "visual"):
return []
tokenizer = self.info.get_tokenizer()
special_tokens: dict[str,
int] = tokenizer.special_tokens # type: ignore
img_start_id = special_tokens[IMG_START]
img_end_id = special_tokens[IMG_END]
img_pad_id = special_tokens[IMG_PAD]
processor = self.info.get_hf_processor()
img_start_id = special_tokens[processor.image_start_tag]
img_end_id = special_tokens[processor.image_end_tag]
img_pad_id = special_tokens[processor.image_pad_tag]
num_image_tokens = self.info.get_num_image_tokens()
image_tokens = [img_pad_id] * num_image_tokens
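The module-level IMG_START / IMG_END / IMG_PAD constants are gone; the tag strings are now read off the tokenizer through the processor's properties, so the special-token ids always match whatever the loaded tokenizer defines. A toy sketch of the lookup pattern (the tokenizer class and ids below are made up):

class FakeQwenTokenizer:
    image_start_tag = "<img>"
    image_end_tag = "</img>"
    image_pad_tag = "<imgpad>"
    special_tokens = {"<img>": 101, "</img>": 102, "<imgpad>": 103}

tok = FakeQwenTokenizer()
img_start_id = tok.special_tokens[tok.image_start_tag]
img_end_id = tok.special_tokens[tok.image_end_tag]
img_pad_id = tok.special_tokens[tok.image_pad_tag]
print(img_start_id, img_end_id, img_pad_id)  # 101 102 103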

View File

@ -885,14 +885,10 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = self._get_max_video_frames(seq_len -
max_image_tokens)
num_frames = min(max(max_total_frames // max(max_videos, 1), 1),
_MAX_FRAMES_PER_VIDEO)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)
# Temporary workaround for https://github.com/huggingface/transformers/issues/35412
if num_frames > 1 and num_frames % 2 == 1:
num_frames += 1
return num_frames
return max(max_frames_per_video, 1)
def get_max_video_tokens(self, seq_len: int) -> int:
target_width, target_height = self.get_image_size_with_most_features()
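Worked example of the simplified frame clamp (numbers illustrative): if the remaining token budget allows max_total_frames = 100 across max_videos = 3, then 100 // 3 = 33 frames per video; the old path would have bumped this to 34 to satisfy the now-removed even-frame workaround, whereas the new code simply returns max(min(33, _MAX_FRAMES_PER_VIDEO), 1) = 33 (assuming the cap is at least 33).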