mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-13 16:17:02 +08:00
[VLM][Bugfix] Pass processor kwargs properly on init (#13516)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
52ce14d31f
commit
377d10bd14
@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||
|
||||
@ -10,7 +10,7 @@ from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputProcessingContext
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.processing import ProcessingCache
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ....multimodal.utils import random_audio, random_image, random_video
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
@ -42,10 +42,7 @@ def _test_processing_correctness(
|
||||
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
|
||||
ctx = InputProcessingContext(
|
||||
model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
# Ensure that it can fit all of the data
|
||||
cache = ProcessingCache(capacity=1 << 30)
|
||||
|
||||
@ -1,17 +1,118 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Tests for H2OVL's multimodal preprocessing kwargs."""
|
||||
from typing import Optional
|
||||
from typing import Mapping, Optional
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _get_expected_num_patches(
|
||||
config: PretrainedConfig,
|
||||
image: Image.Image,
|
||||
num_imgs: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
|
||||
get_h2ovl_target_ratios)
|
||||
|
||||
width, height = image.size
|
||||
|
||||
# Calculate the expected number of blocks
|
||||
if num_imgs == 1 and config.use_msac:
|
||||
# First pass
|
||||
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_h2ovl_target_ratios(
|
||||
min_num=1,
|
||||
max_num=max_num,
|
||||
prior_aspect_ratio=None,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False, # Thumbnail is handled separately
|
||||
)
|
||||
|
||||
# Second pass
|
||||
blocks2, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_h2ovl_target_ratios(
|
||||
min_num=3,
|
||||
max_num=max_num,
|
||||
prior_aspect_ratio=aspect_ratio,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
|
||||
# Add thumbnail if use_thumbnail is True and total_blocks > 1
|
||||
if config.use_thumbnail:
|
||||
blocks1 += 1 if blocks1 > 1 else 0
|
||||
blocks2 += 1 if blocks2 > 1 else 0
|
||||
|
||||
# Total blocks is the sum of blocks from both passes minus
|
||||
# overlapping
|
||||
total_blocks = blocks1 + blocks2 - 1
|
||||
|
||||
return total_blocks
|
||||
|
||||
blocks, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=None,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
expected_num_patches = blocks
|
||||
|
||||
if config.use_thumbnail and expected_num_patches > 1:
|
||||
expected_num_patches += 1
|
||||
|
||||
return expected_num_patches
|
||||
|
||||
|
||||
def _run_check(
|
||||
processor: BaseMultiModalProcessor,
|
||||
images: list[Image.Image],
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
mm_processor_kwargs: Mapping[str, object],
|
||||
):
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
config = processor.info.get_hf_config()
|
||||
|
||||
mm_data = {"image": images}
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
|
||||
processed_inputs = processor.apply("<image>" * len(images), mm_data,
|
||||
mm_processor_kwargs)
|
||||
|
||||
# Ensure the image placeholder count and the pixel batch size both match the expected number of patches
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
|
||||
|
||||
assert img_tok_count == 256 * total_expected_num_patches
|
||||
assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
@ -25,118 +126,54 @@ from ...utils import build_model_context
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
[4.0, 2.0, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
|
||||
@pytest.mark.parametrize(
|
||||
("min_dynamic_patch", "max_dynamic_patch"),
|
||||
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
|
||||
)
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
model_id: str,
|
||||
image_assets: _ImageAssets,
|
||||
size_factors: list[int],
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: Optional[bool],
|
||||
num_imgs: int,
|
||||
kwargs_on_init: bool,
|
||||
):
|
||||
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
|
||||
get_h2ovl_target_ratios)
|
||||
mm_processor_kwargs = {
|
||||
"min_dynamic_patch": min_dynamic_patch,
|
||||
"max_dynamic_patch": max_dynamic_patch,
|
||||
"dynamic_image_size": dynamic_image_size,
|
||||
}
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
config = processor.info.get_hf_config()
|
||||
use_msac = config.use_msac
|
||||
|
||||
mm_processor_kwargs = {
|
||||
"max_dynamic_patch": max_dynamic_patch,
|
||||
}
|
||||
if dynamic_image_size is not None:
|
||||
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
min_num = config.min_dynamic_patch
|
||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_num = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
prompt = "<image>" * num_imgs
|
||||
|
||||
for asset in image_assets:
|
||||
for factor in size_factors:
|
||||
image = rescale_image_size(asset.pil_image, factor)
|
||||
mm_data = {"image": [image] * num_imgs}
|
||||
|
||||
width, height = image.size
|
||||
|
||||
# Calculate the expected number of blocks
|
||||
if num_imgs == 1 and use_msac:
|
||||
# First pass
|
||||
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=None,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False, # Thumbnail is handled separately
|
||||
)
|
||||
|
||||
# Second pass
|
||||
blocks2, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=aspect_ratio,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
|
||||
# Add thumbnail if use_thumbnail is True and total_blocks > 1
|
||||
if config.use_thumbnail:
|
||||
blocks1 += 1 if blocks1 > 1 else 0
|
||||
blocks2 += 1 if blocks2 > 1 else 0
|
||||
|
||||
# Total blocks is the sum of blocks from both passes minus
|
||||
# overlapping
|
||||
total_blocks = blocks1 + blocks2 - 1
|
||||
|
||||
expected_num_patches = total_blocks
|
||||
else:
|
||||
blocks, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=None,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
expected_num_patches = blocks
|
||||
|
||||
if config.use_thumbnail and expected_num_patches != 1:
|
||||
expected_num_patches += 1
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data,
|
||||
mm_processor_kwargs)
|
||||
pixel_shape = (
|
||||
processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
|
||||
|
||||
assert pixel_shape[0] == expected_num_patches * num_imgs
|
||||
_run_check(
|
||||
processor,
|
||||
[
|
||||
rescale_image_size(image_assets[0].pil_image, f)
|
||||
for f in size_factors
|
||||
],
|
||||
min_num,
|
||||
max_num,
|
||||
hf_processor_mm_kwargs,
|
||||
)
|
||||
|
||||
@ -4,7 +4,7 @@ import pytest
|
||||
from transformers import Idefics3Config
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
@ -22,9 +22,15 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_override(image_assets: _ImageAssets, model: str,
|
||||
mm_processor_kwargs: dict[str, object],
|
||||
expected_toks_per_img: int, num_imgs: int):
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
mm_processor_kwargs: dict[str, object],
|
||||
expected_toks_per_img: int,
|
||||
num_imgs: int,
|
||||
kwargs_on_init: bool,
|
||||
):
|
||||
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
@ -33,15 +39,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
placeholders = "<image>" if num_imgs == 1 else "\n".join(
|
||||
@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
|
||||
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
|
||||
mm_data = {"image": [dummy_image] * num_imgs}
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
|
||||
|
||||
# Ensure the placeholder format is correct
|
||||
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
|
||||
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
|
||||
"input_ids"][0]
|
||||
|
||||
@ -1,64 +1,136 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Tests for InternVL's multimodal preprocessing kwargs."""
|
||||
from typing import Optional
|
||||
from typing import Mapping, Optional
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_override(
|
||||
model_id: str,
|
||||
image_assets: _ImageAssets,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: Optional[bool],
|
||||
def _get_expected_num_patches(
|
||||
config: PretrainedConfig,
|
||||
image: Image.Image,
|
||||
num_imgs: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
from vllm.model_executor.models.internvl import (
|
||||
calculate_internvl_targets, get_internvl_target_ratios)
|
||||
|
||||
width, height = image.size
|
||||
|
||||
blocks, _, _ = calculate_internvl_targets(
|
||||
orig_width=width,
|
||||
orig_height=height,
|
||||
target_ratios=get_internvl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
),
|
||||
image_size=config.vision_config.image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
expected_num_patches = blocks
|
||||
|
||||
mm_processor_kwargs = {
|
||||
"max_dynamic_patch": max_dynamic_patch,
|
||||
}
|
||||
if dynamic_image_size is not None:
|
||||
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
if config.use_thumbnail and expected_num_patches > 1:
|
||||
expected_num_patches += 1
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
prompt = "<image>" * num_imgs
|
||||
image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
|
||||
mm_data = {"image": [image] * num_imgs}
|
||||
return expected_num_patches
|
||||
|
||||
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
|
||||
if dynamic_image_size is False:
|
||||
expected_num_patches = 1
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
def _run_check(
|
||||
processor: BaseMultiModalProcessor,
|
||||
images: list[Image.Image],
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
mm_processor_kwargs: Mapping[str, object],
|
||||
):
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
config = processor.info.get_hf_config()
|
||||
|
||||
mm_data = {"image": images}
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
|
||||
processed_inputs = processor.apply("<image>" * len(images), mm_data,
|
||||
mm_processor_kwargs)
|
||||
|
||||
# Ensure the image placeholder count and the pixel batch size both match the expected number of patches
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
|
||||
|
||||
assert img_tok_count == 256 * expected_num_patches * num_imgs
|
||||
assert pixel_shape[0] == expected_num_patches * num_imgs
|
||||
assert img_tok_count == 256 * total_expected_num_patches
|
||||
assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
[4.0, 2.0, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
("min_dynamic_patch", "max_dynamic_patch"),
|
||||
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
|
||||
)
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
model_id: str,
|
||||
image_assets: _ImageAssets,
|
||||
size_factors: list[int],
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: Optional[bool],
|
||||
kwargs_on_init: bool,
|
||||
):
|
||||
mm_processor_kwargs = {
|
||||
"min_dynamic_patch": min_dynamic_patch,
|
||||
"max_dynamic_patch": max_dynamic_patch,
|
||||
"dynamic_image_size": dynamic_image_size,
|
||||
}
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_num = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
_run_check(
|
||||
processor,
|
||||
[
|
||||
rescale_image_size(image_assets[0].pil_image, f)
|
||||
for f in size_factors
|
||||
],
|
||||
min_num,
|
||||
max_num,
|
||||
hf_processor_mm_kwargs,
|
||||
)
|
||||
|
||||
@ -10,7 +10,7 @@ from pqdm.threads import pqdm
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
||||
)
|
||||
info = processor.info
|
||||
|
||||
@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
||||
)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
||||
)
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
|
||||
@ -10,7 +10,7 @@ from pqdm.threads import pqdm
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
||||
)
|
||||
info = processor.info
|
||||
|
||||
@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
||||
)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
),
|
||||
tokenizer=cached_tokenizer_from_config(ctx.model_config),
|
||||
)
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
@ -21,12 +21,14 @@ from ...utils import build_model_context
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model_id: str,
|
||||
mm_processor_kwargs: dict[str, int],
|
||||
expected_toks_per_img: int,
|
||||
num_imgs: int,
|
||||
kwargs_on_init: bool,
|
||||
):
|
||||
"""Ensure input_processor_for_phi3v handles num_crops properly."""
|
||||
# Avoid initializing CUDA early
|
||||
@ -36,23 +38,22 @@ def test_processor_override(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
|
||||
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
|
||||
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
@ -18,6 +18,7 @@ from ...utils import build_model_context
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model_id: str,
|
||||
@ -25,31 +26,30 @@ def test_processor_override(
|
||||
expected_toks_per_img: int,
|
||||
expected_pixels_shape: tuple[int, int],
|
||||
num_imgs: int,
|
||||
kwargs_on_init: bool,
|
||||
):
|
||||
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
mm_processor_kwargs=None,
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
trust_remote_code=ctx.model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
|
||||
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
|
||||
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
|
||||
|
||||
@ -248,13 +248,16 @@ def check_logprobs_close(
|
||||
warnings.warn(fail_msg, stacklevel=2)
|
||||
|
||||
|
||||
def build_model_context(model_name: str,
|
||||
task: TaskOption = "auto",
|
||||
tokenizer_name: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
mm_processor_kwargs: Optional[Dict] = None,
|
||||
limit_mm_per_prompt: Optional[Dict] = None):
|
||||
def build_model_context(
|
||||
model_name: str,
|
||||
task: TaskOption = "auto",
|
||||
tokenizer_name: Optional[str] = None,
|
||||
trust_remote_code: bool = False,
|
||||
dtype: Optional[Union[str, torch.dtype]] = None,
|
||||
mm_processor_kwargs: Optional[Dict] = None,
|
||||
limit_mm_per_prompt: Optional[Dict] = None,
|
||||
disable_mm_preprocessor_cache: bool = True,
|
||||
):
|
||||
"""Creates an InputContext for a given model.
|
||||
|
||||
Args:
|
||||
@ -283,5 +286,6 @@ def build_model_context(model_name: str,
|
||||
seed=0,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
|
||||
)
|
||||
return InputContext(model_config)
|
||||
|
||||
@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
|
||||
replace_token_matches)
|
||||
# yapf: enable
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||
cached_tokenizer_from_config)
|
||||
from vllm.utils import full_groupby
|
||||
|
||||
from .utils import random_image
|
||||
@ -577,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
model_config,
|
||||
tokenizer=cached_get_tokenizer(model_config.tokenizer),
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
@ -617,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
model_config,
|
||||
tokenizer=cached_get_tokenizer(model_config.tokenizer),
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
@ -689,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
model_config,
|
||||
tokenizer=cached_get_tokenizer(model_config.tokenizer),
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
orig_get_hf_processor = processor.info.get_hf_processor
|
||||
|
||||
|
||||
@ -11,8 +11,9 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
|
||||
from typing_extensions import TypeVar, assert_never
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import cached_get_processor
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||
cached_tokenizer_from_config)
|
||||
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
|
||||
resolve_mm_processor_kwargs)
|
||||
|
||||
@ -27,19 +28,9 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig)
|
||||
P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
|
||||
|
||||
class HashableDict(dict):
|
||||
"""
|
||||
A dictionary that can be hashed by lru_cache.
|
||||
"""
|
||||
|
||||
# NOTE: pythonic dict is not hashable,
|
||||
# we override on it directly for simplicity
|
||||
def __hash__(self) -> int: # type: ignore[override]
|
||||
return hash(frozenset(self.items()))
|
||||
_T = TypeVar("_T")
|
||||
_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
|
||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@ -54,9 +45,9 @@ class InputContext:
|
||||
|
||||
def get_hf_config(
|
||||
self,
|
||||
typ: Union[type[C], tuple[type[C], ...]] = PretrainedConfig,
|
||||
typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
|
||||
/,
|
||||
) -> C:
|
||||
) -> _C:
|
||||
"""
|
||||
Get the HuggingFace configuration
|
||||
(:class:`transformers.PretrainedConfig`) of the model,
|
||||
@ -94,10 +85,10 @@ class InputContext:
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin,
|
||||
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> P:
|
||||
) -> _P:
|
||||
"""
|
||||
Get the HuggingFace processor
|
||||
(:class:`transformers.ProcessorMixin`) of the model,
|
||||
@ -106,33 +97,29 @@ class InputContext:
|
||||
Raises:
|
||||
TypeError: If the processor is not of the specified type.
|
||||
"""
|
||||
return cached_processor_from_config(
|
||||
self.model_config,
|
||||
processor_cls=typ,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def init_processor(
|
||||
self,
|
||||
typ: type[_T],
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> _T:
|
||||
"""
|
||||
Initialize a HuggingFace-like processor class, merging the
|
||||
keyword arguments with those in the model's configuration.
|
||||
"""
|
||||
base_kwargs = self.model_config.mm_processor_kwargs
|
||||
if base_kwargs is None:
|
||||
base_kwargs = {}
|
||||
|
||||
merged_kwargs = {**base_kwargs, **kwargs}
|
||||
|
||||
if isinstance(typ, type):
|
||||
merged_kwargs["processor_cls"] = typ
|
||||
|
||||
# NOTE: Pythonic dict is not hashable and will raise unhashable type
|
||||
# error when calling `cached_get_processor`, therefore we need to
|
||||
# wrap it to a hashable dict.
|
||||
for key, value in merged_kwargs.items():
|
||||
if isinstance(value, dict):
|
||||
merged_kwargs[key] = HashableDict(value)
|
||||
|
||||
hf_processor = cached_get_processor(
|
||||
self.model_config.model,
|
||||
trust_remote_code=self.model_config.trust_remote_code,
|
||||
**merged_kwargs,
|
||||
)
|
||||
if not isinstance(hf_processor, typ):
|
||||
raise TypeError("Invalid type of HuggingFace processor. "
|
||||
f"Expected type: {typ}, but "
|
||||
f"found type: {type(hf_processor)}")
|
||||
|
||||
return hf_processor
|
||||
return typ(**merged_kwargs)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@ -142,10 +129,10 @@ class InputProcessingContext(InputContext):
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin,
|
||||
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
/,
|
||||
**kwargs: object,
|
||||
) -> P:
|
||||
) -> _P:
|
||||
return super().get_hf_processor(
|
||||
typ,
|
||||
tokenizer=self.tokenizer,
|
||||
@ -341,13 +328,9 @@ class InputRegistry:
|
||||
from vllm.model_executor.model_loader import get_model_architecture
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.multimodal.profiling import MultiModalProfiler
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
if mm_registry.has_processor(model_config):
|
||||
tokenizer = cached_get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
processor = mm_registry.create_processor(model_config, tokenizer)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
dummy_data = profiler.get_dummy_data(
|
||||
|
||||
@ -400,8 +400,8 @@ class AriaProcessingInfo(BaseProcessingInfo):
|
||||
def get_vision_config(self):
|
||||
return self.get_hf_config().vision_config
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(AriaProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(AriaProcessor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
@ -58,8 +58,8 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config(ChameleonConfig)
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(ChameleonProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(ChameleonProcessor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": 1}
|
||||
|
||||
@ -28,13 +28,13 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
BaseProcessingInfo, PromptReplacement)
|
||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
|
||||
MlpProjectorConfig,
|
||||
VisionEncoderConfig)
|
||||
from vllm.transformers_utils.processors.deepseek_vl2 import (
|
||||
DeepseekVLV2Processor)
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .interfaces import SupportsMultiModal, SupportsPP
|
||||
@ -133,8 +133,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config(DeepseekVLV2Config)
|
||||
|
||||
def get_hf_processor(self) -> DeepseekVLV2Processor:
|
||||
return self.ctx.get_hf_processor(DeepseekVLV2Processor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
@ -308,13 +308,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self.text_config = config.text_config
|
||||
|
||||
model_config = vllm_config.model_config
|
||||
tokenizer = cached_get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
tokenizer_revision=model_config.tokenizer_revision,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
|
||||
|
||||
self.vision = self._init_vision_module(self.vision_config,
|
||||
quant_config,
|
||||
|
||||
@ -71,8 +71,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config(FuyuConfig)
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(FuyuProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
|
||||
|
||||
def get_image_processor(self) -> FuyuImageProcessor:
|
||||
return self.get_hf_processor().image_processor
|
||||
|
||||
@ -416,18 +416,15 @@ class GLM4VProcessor:
|
||||
|
||||
class GLM4VProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_tokenizer(self):
|
||||
tokenizer = self.ctx.tokenizer
|
||||
assert isinstance(tokenizer, PreTrainedTokenizer)
|
||||
return tokenizer
|
||||
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config(ChatGLMConfig)
|
||||
|
||||
def get_hf_processor(self) -> GLM4VProcessor:
|
||||
return GLM4VProcessor(
|
||||
self.get_hf_config(),
|
||||
self.get_tokenizer(),
|
||||
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
|
||||
return self.ctx.init_processor(
|
||||
GLM4VProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
|
||||
@ -15,9 +15,9 @@ from vllm.model_executor.layers.pooler import PoolerHead
|
||||
from vllm.model_executor.models.llama import LlamaForCausalLM
|
||||
from vllm.model_executor.pooling_metadata import (PoolingMetadata,
|
||||
PoolingTensors)
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.sequence import (IntermediateTensors, PoolerOutput,
|
||||
PoolingSequenceGroupOutput)
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -29,12 +29,7 @@ class GritLMPooler(nn.Module):
|
||||
|
||||
self.model_config = model_config
|
||||
|
||||
tokenizer = cached_get_tokenizer(
|
||||
self.model_config.tokenizer,
|
||||
tokenizer_mode=self.model_config.tokenizer_mode,
|
||||
tokenizer_revision=self.model_config.tokenizer_revision,
|
||||
trust_remote_code=self.model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(self.model_config)
|
||||
|
||||
# Collect the tokens needed for pattern matching.
|
||||
# "▁<" is different from "_<". The former uses "▁" to indicate that
|
||||
|
||||
@ -41,6 +41,7 @@ def resolve_h2ovl_min_max_num(
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
if use_thumbnail and max_dynamic_patch != 1:
|
||||
@ -190,7 +191,7 @@ def image_to_pixel_values_h2ovl(
|
||||
pixel_values1, aspect_ratio1 = _preprocess_image(
|
||||
image,
|
||||
input_size=input_size,
|
||||
min_num=min_num,
|
||||
min_num=1,
|
||||
max_num=max_num,
|
||||
use_thumbnail=True,
|
||||
prior_aspect_ratio=None,
|
||||
@ -199,7 +200,7 @@ def image_to_pixel_values_h2ovl(
|
||||
pixel_values2, _ = _preprocess_image(
|
||||
image,
|
||||
input_size=input_size,
|
||||
min_num=3, # Hardcoded value
|
||||
min_num=3,
|
||||
max_num=max_num,
|
||||
use_thumbnail=True,
|
||||
prior_aspect_ratio=aspect_ratio1,
|
||||
@ -228,6 +229,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
config: PretrainedConfig,
|
||||
tokenizer: AnyTokenizer,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
use_msac: Optional[bool] = None,
|
||||
@ -235,6 +237,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
super().__init__(
|
||||
config,
|
||||
tokenizer,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
@ -267,11 +270,13 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
use_thumbnail: Optional[bool] = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = self.min_dynamic_patch
|
||||
min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
|
||||
is None else min_dynamic_patch)
|
||||
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
|
||||
is None else max_dynamic_patch)
|
||||
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
|
||||
@ -289,18 +294,21 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
use_thumbnail: Optional[bool] = None,
|
||||
prior_aspect_ratio: Optional[tuple[int, int]] = None,
|
||||
override_min_num: Optional[int] = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
if prior_aspect_ratio: # hardcoded value for second pass of use_msac
|
||||
min_num = 3
|
||||
if override_min_num is not None:
|
||||
min_num = override_min_num
|
||||
|
||||
return get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
@ -322,6 +330,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
if use_msac:
|
||||
target_ratios_1 = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
override_min_num=1,
|
||||
)
|
||||
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
@ -334,6 +343,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
target_ratios_2 = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
prior_aspect_ratio=aspect_ratio_1,
|
||||
override_min_num=3,
|
||||
)
|
||||
num_patches_2, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
@ -361,12 +371,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
) -> list[torch.Tensor]:
|
||||
use_msac = self.use_msac if len(images) == 1 else False
|
||||
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
@ -389,14 +401,23 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> H2OVLProcessor:
|
||||
return H2OVLProcessor(
|
||||
self.get_hf_config(),
|
||||
self.get_tokenizer(),
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
if min_dynamic_patch is not None:
|
||||
kwargs["min_dynamic_patch"] = min_dynamic_patch
|
||||
if max_dynamic_patch is not None:
|
||||
kwargs["max_dynamic_patch"] = max_dynamic_patch
|
||||
if dynamic_image_size is not None:
|
||||
kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
return self.ctx.init_processor(
|
||||
H2OVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_mm_max_tokens_per_item(
|
||||
|
||||
@ -83,13 +83,15 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
|
||||
class Idefics3ProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
size: Optional[Dict[str, int]] = None) -> Idefics3Processor:
|
||||
self,
|
||||
*,
|
||||
size: Optional[Dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
) -> Idefics3Processor:
|
||||
if size is not None:
|
||||
return self.ctx.get_hf_processor(Idefics3Processor, size=size)
|
||||
kwargs["size"] = size
|
||||
|
||||
return self.ctx.get_hf_processor(Idefics3Processor)
|
||||
return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
@ -120,6 +120,7 @@ def resolve_internvl_min_max_num(
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
if use_thumbnail and max_dynamic_patch != 1:
|
||||
@ -247,6 +248,7 @@ class BaseInternVLProcessor(ABC):
|
||||
config: PretrainedConfig,
|
||||
tokenizer: AnyTokenizer,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
) -> None:
|
||||
@ -258,18 +260,22 @@ class BaseInternVLProcessor(ABC):
|
||||
image_size: int = config.vision_config.image_size
|
||||
patch_size: int = config.vision_config.patch_size
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = config.dynamic_image_size
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = config.min_dynamic_patch
|
||||
assert isinstance(min_dynamic_patch, int)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = config.max_dynamic_patch
|
||||
assert isinstance(max_dynamic_patch, int)
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = config.dynamic_image_size
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size)**2 * (config.downsample_ratio**2))
|
||||
self.image_size = image_size
|
||||
self.min_dynamic_patch: int = config.min_dynamic_patch
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
self.use_thumbnail: bool = config.use_thumbnail
|
||||
@ -298,11 +304,13 @@ class BaseInternVLProcessor(ABC):
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
use_thumbnail: Optional[bool] = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = self.min_dynamic_patch
|
||||
min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
|
||||
is None else min_dynamic_patch)
|
||||
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
|
||||
is None else max_dynamic_patch)
|
||||
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
|
||||
@ -320,11 +328,13 @@ class BaseInternVLProcessor(ABC):
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
use_thumbnail: Optional[bool] = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
@ -355,10 +365,12 @@ class BaseInternVLProcessor(ABC):
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
@ -378,6 +390,7 @@ class BaseInternVLProcessor(ABC):
|
||||
self,
|
||||
text: Optional[Union[str, list[str]]] = None,
|
||||
images: Optional[Union[Image.Image, list[Image.Image]]] = None,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
@ -396,6 +409,7 @@ class BaseInternVLProcessor(ABC):
|
||||
else:
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> BaseInternVLProcessor:
|
||||
raise NotImplementedError
|
||||
|
||||
@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> InternVLProcessor:
|
||||
return InternVLProcessor(
|
||||
self.get_hf_config(),
|
||||
self.get_tokenizer(),
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
if min_dynamic_patch is not None:
|
||||
kwargs["min_dynamic_patch"] = min_dynamic_patch
|
||||
if max_dynamic_patch is not None:
|
||||
kwargs["max_dynamic_patch"] = max_dynamic_patch
|
||||
if dynamic_image_size is not None:
|
||||
kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
return self.ctx.init_processor(
|
||||
InternVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -119,7 +119,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
|
||||
return get_vision_encoder_info(self.get_hf_config())
|
||||
|
||||
@abstractmethod
|
||||
def get_hf_processor(self) -> LlavaLikeProcessor:
|
||||
def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
@ -208,8 +208,8 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
|
||||
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(LlavaProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
|
||||
|
||||
|
||||
class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
@ -272,8 +272,8 @@ class LlavaMultiModalProcessor(
|
||||
|
||||
class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(PixtralProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
|
||||
|
||||
|
||||
class PixtralHFMultiModalProcessor(
|
||||
@ -742,23 +742,24 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
|
||||
class MantisProcessingInfo(LlavaProcessingInfo):
|
||||
|
||||
def get_hf_processor(self):
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
hf_config = self.get_hf_config()
|
||||
vision_info = self.get_vision_encoder_info()
|
||||
|
||||
kwargs.setdefault("patch_size", vision_info.get_patch_size())
|
||||
|
||||
if Version(TRANSFORMERS_VERSION) < Version("4.48"):
|
||||
# BUG: num_additional_image_tokens = 0 but treated as 1,
|
||||
# so we set vision_feature_select_strategy to None to offset this
|
||||
vision_feature_select_strategy = None
|
||||
kwargs.setdefault("vision_feature_select_strategy", None)
|
||||
else:
|
||||
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
|
||||
vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501
|
||||
kwargs.setdefault(
|
||||
"vision_feature_select_strategy",
|
||||
hf_config.vision_feature_select_strategy,
|
||||
)
|
||||
|
||||
return self.ctx.get_hf_processor(
|
||||
LlavaProcessor,
|
||||
patch_size=vision_info.get_patch_size(),
|
||||
vision_feature_select_strategy=vision_feature_select_strategy,
|
||||
)
|
||||
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
|
||||
|
||||
|
||||
class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
||||
|
||||
@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
|
||||
def get_hf_config(self) -> LlavaNextLikeConfig:
|
||||
return self.ctx.get_hf_config(LlavaNextConfig)
|
||||
|
||||
def get_hf_processor(self):
|
||||
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor, **kwargs)
|
||||
|
||||
# In case patch_size is omitted from `processor_config.json`
|
||||
# e.g. for E5-V: https://huggingface.co/royokong/e5-v
|
||||
|
||||
@ -56,8 +56,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
|
||||
def get_vision_encoder_info(self):
|
||||
return get_vision_encoder_info(self.get_hf_config())
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"video": 1}
|
||||
|
||||
@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
|
||||
def get_hf_config(self) -> LlavaOnevisionLikeConfig:
|
||||
return self.ctx.get_hf_config(LlavaOnevisionConfig)
|
||||
|
||||
def get_hf_processor(self):
|
||||
return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": None}
|
||||
|
||||
@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config()
|
||||
|
||||
def get_hf_processor(
|
||||
self,
|
||||
**kwargs: object,
|
||||
):
|
||||
hf_processor = self.ctx.get_hf_processor()
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
hf_processor = self.ctx.get_hf_processor(**kwargs)
|
||||
|
||||
# NumPy arrays are considered as Iterable but not Sequence in
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428
|
||||
|
||||
@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_config(self) -> MllamaConfig:
|
||||
return self.ctx.get_hf_config(MllamaConfig)
|
||||
|
||||
def get_hf_processor(self) -> MllamaProcessor:
|
||||
return self.ctx.get_hf_processor(MllamaProcessor)
|
||||
def get_hf_processor(self, **kwargs: object) -> MllamaProcessor:
|
||||
return self.ctx.get_hf_processor(MllamaProcessor, **kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
@ -1200,8 +1200,8 @@ class MolmoProcessorWrapper:
|
||||
|
||||
class MolmoProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_hf_processor(self) -> MolmoProcessorWrapper:
|
||||
processor = self.ctx.get_hf_processor()
|
||||
def get_hf_processor(self, **kwargs: object) -> MolmoProcessorWrapper:
|
||||
processor = self.ctx.get_hf_processor(**kwargs)
|
||||
return MolmoProcessorWrapper(processor)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
|
||||
@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_hf_processor(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: Optional[int] = None,
|
||||
max_dynamic_patch: Optional[int] = None,
|
||||
dynamic_image_size: Optional[bool] = None,
|
||||
**kwargs: object,
|
||||
) -> NVLMProcessor:
|
||||
return NVLMProcessor(
|
||||
self.get_hf_config(),
|
||||
self.get_tokenizer(),
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
if min_dynamic_patch is not None:
|
||||
kwargs["min_dynamic_patch"] = min_dynamic_patch
|
||||
if max_dynamic_patch is not None:
|
||||
kwargs["max_dynamic_patch"] = max_dynamic_patch
|
||||
if dynamic_image_size is not None:
|
||||
kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
return self.ctx.init_processor(
|
||||
NVLMProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
|
||||
@ -16,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import NestedTensors
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from .interfaces import SupportsMultiModal, SupportsPP
|
||||
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
|
||||
@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext,
|
||||
model_config = ctx.model_config
|
||||
hf_config = ctx.get_hf_config(PaliGemmaConfig)
|
||||
|
||||
tokenizer = cached_get_tokenizer(model_config.tokenizer)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
image_feature_size = hf_config.text_config.num_image_tokens
|
||||
image_token_str = tokenizer.decode(hf_config.image_token_index)
|
||||
bos_token = tokenizer.decode(hf_config.bos_token_id)
|
||||
|
||||
@ -313,11 +313,12 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
|
||||
self,
|
||||
*,
|
||||
num_crops: Optional[int] = None,
|
||||
**kwargs: object,
|
||||
) -> ProcessorMixin:
|
||||
if num_crops is not None:
|
||||
return self.ctx.get_hf_processor(num_crops=num_crops)
|
||||
kwargs["num_crops"] = num_crops
|
||||
|
||||
return self.ctx.get_hf_processor()
|
||||
return self.ctx.get_hf_processor(**kwargs)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
@ -32,9 +32,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
|
||||
from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
|
||||
from vllm.multimodal.utils import (cached_get_tokenizer,
|
||||
consecutive_placeholder_ranges)
|
||||
from vllm.multimodal.utils import consecutive_placeholder_ranges
|
||||
from vllm.sequence import IntermediateTensors, SequenceData
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
|
||||
from .interfaces import SupportsMultiModal, SupportsPP
|
||||
from .utils import (init_vllm_registered_model, maybe_prefix,
|
||||
@ -49,9 +49,7 @@ except ImportError:
|
||||
|
||||
|
||||
def get_max_pixtral_image_tokens(ctx: InputContext):
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
tokenizer_mode=ctx.model_config.tokenizer_mode)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
mm_encoder = tokenizer.instruct.mm_encoder
|
||||
|
||||
image_config = mm_encoder.mm_config if hasattr(
|
||||
@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext):
|
||||
|
||||
def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
|
||||
mm_counts: Mapping[str, int]):
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
tokenizer_mode=ctx.model_config.tokenizer_mode)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
|
||||
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
|
||||
image_token_id = mm_encoder.special_ids.img
|
||||
@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
|
||||
MultiModalKwargs containing the stacked normalized images tensor or
|
||||
image embeddings.
|
||||
"""
|
||||
model_config = ctx.model_config
|
||||
tokenizer = cached_get_tokenizer(
|
||||
model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
|
||||
data_list = data if isinstance(data, list) else [data]
|
||||
|
||||
@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
|
||||
|
||||
prompt_token_ids = inputs.get("prompt_token_ids")
|
||||
prompt = inputs.get("prompt")
|
||||
tokenizer = cached_get_tokenizer(
|
||||
ctx.model_config.tokenizer,
|
||||
tokenizer_mode=ctx.model_config.tokenizer_mode)
|
||||
tokenizer = cached_tokenizer_from_config(ctx.model_config)
|
||||
|
||||
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
|
||||
image_token_id = mm_encoder.special_ids.img
|
||||
|
||||
@ -36,8 +36,6 @@ from transformers import BatchFeature
|
||||
from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
|
||||
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
|
||||
Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
|
||||
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
|
||||
Qwen2VLImageProcessorFast)
|
||||
|
||||
from vllm.attention import AttentionMetadata
|
||||
from vllm.config import VllmConfig
|
||||
@ -690,41 +688,20 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
fps: Optional[float] = 2.0,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
fps: Optional[float] = None,
|
||||
**kwargs: object,
|
||||
) -> Qwen2_5_VLProcessor:
|
||||
hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor)
|
||||
image_processor = hf_processor.image_processor # type: ignore
|
||||
assert isinstance(image_processor,
|
||||
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
|
||||
if fps is not None:
|
||||
kwargs["fps"] = fps
|
||||
|
||||
if min_pixels:
|
||||
image_processor.min_pixels = min_pixels
|
||||
if max_pixels:
|
||||
image_processor.max_pixels = max_pixels
|
||||
if max_pixels or min_pixels:
|
||||
image_processor.size = {
|
||||
"min_pixels": image_processor.min_pixels,
|
||||
"max_pixels": image_processor.max_pixels,
|
||||
}
|
||||
|
||||
return hf_processor
|
||||
|
||||
def get_image_processor(
|
||||
self,
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
fps: Optional[float] = 2.0,
|
||||
) -> Union[Qwen2VLImageProcessor, Qwen2VLImageProcessorFast]:
|
||||
hf_processor = self.get_hf_processor(
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
fps=fps,
|
||||
return self.ctx.get_hf_processor(
|
||||
Qwen2_5_VLProcessor,
|
||||
image_processor=self.get_image_processor(min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
size=size),
|
||||
**kwargs,
|
||||
)
|
||||
image_processor = hf_processor.image_processor # type: ignore
|
||||
assert isinstance(image_processor,
|
||||
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
|
||||
return image_processor
|
||||
|
||||
|
||||
class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
|
||||
|
||||
@ -93,8 +93,9 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
|
||||
*,
|
||||
# Ignored in initialization
|
||||
sampling_rate: Optional[int] = None,
|
||||
**kwargs: object,
|
||||
) -> Qwen2AudioProcessor:
|
||||
return self.ctx.get_hf_processor(Qwen2AudioProcessor)
|
||||
return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)
|
||||
|
||||
def get_feature_extractor(
|
||||
self,
|
||||
|
||||
@ -31,9 +31,7 @@ import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange, repeat
|
||||
from packaging.version import Version
|
||||
from transformers import BatchFeature
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
|
||||
Qwen2VLProcessor)
|
||||
from transformers.models.qwen2_vl.configuration_qwen2_vl import (
|
||||
@ -69,6 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.platforms import _Backend
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.config import uses_mrope
|
||||
from vllm.transformers_utils.processor import (
|
||||
cached_image_processor_from_config)
|
||||
|
||||
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
|
||||
from .utils import (AutoWeightsLoader, WeightsMapper,
|
||||
@ -722,40 +722,64 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
) -> Qwen2VLProcessor:
|
||||
hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
|
||||
image_processor = hf_processor.image_processor # type: ignore
|
||||
assert isinstance(image_processor, Qwen2VLImageProcessor)
|
||||
return self.ctx.get_hf_processor(
|
||||
Qwen2VLProcessor,
|
||||
image_processor=self.get_image_processor(min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
size=size),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if min_pixels:
|
||||
image_processor.min_pixels = min_pixels
|
||||
if max_pixels:
|
||||
image_processor.max_pixels = max_pixels
|
||||
if max_pixels or min_pixels:
|
||||
image_processor.size = {
|
||||
"min_pixels": image_processor.min_pixels,
|
||||
"max_pixels": image_processor.max_pixels,
|
||||
}
|
||||
def _get_image_processor_kwargs(
|
||||
self,
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
):
|
||||
if self.ctx.model_config.mm_processor_kwargs:
|
||||
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
|
||||
|
||||
return hf_processor
|
||||
if min_pixels is not None:
|
||||
kwargs["min_pixels"] = min_pixels
|
||||
|
||||
if size is None:
|
||||
size = {"shortest_edge": min_pixels}
|
||||
else:
|
||||
size["shortest_edge"] = min_pixels
|
||||
|
||||
if max_pixels is not None:
|
||||
kwargs["max_pixels"] = max_pixels
|
||||
|
||||
if size is None:
|
||||
size = {"longest_edge": max_pixels}
|
||||
else:
|
||||
size["longest_edge"] = max_pixels
|
||||
|
||||
if size is not None:
|
||||
kwargs["size"] = size
|
||||
|
||||
return kwargs
|
||||
|
||||
def get_image_processor(
|
||||
self,
|
||||
*,
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
size: Optional[dict[str, int]] = None,
|
||||
**kwargs: object,
|
||||
):
|
||||
hf_processor = self.get_hf_processor(min_pixels=min_pixels,
|
||||
max_pixels=max_pixels)
|
||||
image_processor = hf_processor.image_processor # type: ignore
|
||||
if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
|
||||
from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
|
||||
assert isinstance(
|
||||
image_processor,
|
||||
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
|
||||
else:
|
||||
assert isinstance(image_processor, Qwen2VLImageProcessor)
|
||||
return image_processor
|
||||
return cached_image_processor_from_config(
|
||||
self.ctx.model_config,
|
||||
**self._get_image_processor_kwargs(min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
size=size,
|
||||
**kwargs),
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": None}
|
||||
@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
||||
def _get_data_parser(self) -> MultiModalDataParser:
|
||||
return Qwen2VLMultiModalDataParser()
|
||||
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
return self.info.ctx.call_hf_processor(
|
||||
self.info.get_hf_processor(**mm_kwargs),
|
||||
dict(text=prompt, **mm_data),
|
||||
self.info._get_image_processor_kwargs(**mm_kwargs),
|
||||
)
|
||||
|
||||
def _get_prompt_replacements(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
@ -964,8 +1000,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
vocab = tokenizer.get_vocab()
|
||||
|
||||
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
|
||||
# image_token and video_token registered
|
||||
placeholder = {
|
||||
"image": vocab[hf_processor.image_token],
|
||||
"video": vocab[hf_processor.video_token],
|
||||
|
||||
@ -519,8 +519,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
return _get_tokenizer_without_image_pad(tokenizer)
|
||||
|
||||
def get_hf_processor(self) -> QwenVLProcessor:
|
||||
return QwenVLProcessor(self.get_hf_config(), self.get_tokenizer())
|
||||
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
QwenVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None}
|
||||
|
||||
@ -68,8 +68,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
|
||||
*,
|
||||
# Ignored in initialization
|
||||
sampling_rate: Optional[int] = None,
|
||||
**kwargs: object,
|
||||
) -> ProcessorMixin:
|
||||
hf_processor = self.ctx.get_hf_processor()
|
||||
hf_processor = self.ctx.get_hf_processor(**kwargs)
|
||||
|
||||
# NOTE: Ultravox processing definition uses '<|eot_id|>' as the
|
||||
# placeholder that will cause confusion with the actual end of turn
|
||||
|
||||
@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
|
||||
NestedTensors)
|
||||
from vllm.multimodal.audio import resample_audio
|
||||
from vllm.sequence import SequenceData
|
||||
from vllm.transformers_utils.processor import cached_get_processor
|
||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||
|
||||
from .interfaces import SupportsMultiModal, SupportsTranscription
|
||||
from .utils import AutoWeightsLoader, WeightsMapper, make_layers
|
||||
@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
|
||||
mm_counts: Mapping[str, int]):
|
||||
assert mm_counts["audio"] == 1
|
||||
num_tokens = get_max_whisper_audio_tokens(ctx)
|
||||
processor = cached_get_processor(ctx.model_config.model)
|
||||
processor = cached_processor_from_config(ctx.model_config)
|
||||
chunk_length = processor.feature_extractor.chunk_length
|
||||
sampling_rate = processor.feature_extractor.sampling_rate
|
||||
num_samples = chunk_length * sampling_rate
|
||||
@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs):
|
||||
multi_modal_data["audio"] = multi_modal_data["audio"][0]
|
||||
# Resample and process audio
|
||||
audio, orig_sr = multi_modal_data["audio"]
|
||||
processor = cached_get_processor(ctx.model_config.model)
|
||||
processor = cached_processor_from_config(ctx.model_config)
|
||||
target_sr = processor.feature_extractor.sampling_rate
|
||||
audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
|
||||
multi_modal_data["audio"] = (audio, target_sr)
|
||||
@ -618,7 +618,7 @@ def input_mapper_for_whisper(
|
||||
if len(multi_modal_data) == 0:
|
||||
return MultiModalKwargs()
|
||||
|
||||
processor = cached_get_processor(ctx.model_config.model)
|
||||
processor = cached_processor_from_config(ctx.model_config)
|
||||
sampling_rate = processor.feature_extractor.sampling_rate
|
||||
|
||||
audios = [audio for audio, _ in multi_modal_data]
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import base64
|
||||
from functools import lru_cache
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional
|
||||
@ -11,7 +10,7 @@ from PIL import Image
|
||||
|
||||
from vllm.inputs.registry import InputContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import get_image_processor
|
||||
from vllm.transformers_utils.processor import cached_get_image_processor
|
||||
from vllm.utils import is_list_of
|
||||
|
||||
from .base import MediaIO, MultiModalPlugin
|
||||
@ -22,8 +21,6 @@ if TYPE_CHECKING:
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
cached_get_image_processor = lru_cache(get_image_processor)
|
||||
|
||||
|
||||
class ImagePlugin(MultiModalPlugin):
|
||||
"""Plugin for image data."""
|
||||
|
||||
@ -11,7 +11,8 @@ import torch.nn as nn
|
||||
from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
|
||||
from vllm.inputs import InputProcessingContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
|
||||
cached_tokenizer_from_config)
|
||||
from vllm.utils import ClassRegistry
|
||||
|
||||
from .audio import AudioPlugin
|
||||
@ -21,7 +22,6 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
|
||||
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
|
||||
ProcessingCache)
|
||||
from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
|
||||
from .utils import cached_get_tokenizer
|
||||
from .video import VideoPlugin
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -256,10 +256,7 @@ class MultiModalRegistry:
|
||||
on underlying model configuration.
|
||||
"""
|
||||
if self.has_processor(model_config):
|
||||
tokenizer = cached_get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
processor = self.create_processor(model_config, tokenizer)
|
||||
seq_len = model_config.max_model_len
|
||||
mm_limits = self.get_mm_limits_per_prompt(model_config)
|
||||
@ -374,10 +371,7 @@ class MultiModalRegistry:
|
||||
This should be called after :meth:`init_mm_limits_per_prompt`.
|
||||
"""
|
||||
if self.has_processor(model_config):
|
||||
tokenizer = cached_get_tokenizer(
|
||||
model_config.tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
processor = self.create_processor(model_config, tokenizer)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
return profiler.get_mm_limits()
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from functools import lru_cache
|
||||
from itertools import groupby
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Optional, TypeVar, Union
|
||||
@ -13,7 +12,7 @@ from PIL import Image
|
||||
import vllm.envs as envs
|
||||
from vllm.connections import HTTPConnection, global_http_connection
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .audio import AudioMediaIO
|
||||
from .base import MediaIO
|
||||
@ -23,8 +22,6 @@ from .video import VideoMediaIO
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||
|
||||
_M = TypeVar("_M")
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import base64
|
||||
from functools import lru_cache, partial
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional
|
||||
@ -12,8 +12,7 @@ from PIL import Image
|
||||
|
||||
from vllm.inputs.registry import InputContext
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.processor import get_video_processor
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.transformers_utils.processor import cached_get_video_processor
|
||||
from vllm.utils import PlaceholderModule, is_list_of
|
||||
|
||||
from .base import MediaIO, ModalityData
|
||||
@ -30,9 +29,6 @@ except ImportError:
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
cached_get_video_processor = lru_cache(get_video_processor)
|
||||
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||
|
||||
|
||||
class VideoPlugin(ImagePlugin):
|
||||
"""Plugin for video data."""
|
||||
|
||||
@ -1,25 +1,59 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Any, cast
|
||||
from typing import TYPE_CHECKING, Any, Union, cast
|
||||
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
from typing_extensions import TypeVar
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||
|
||||
|
||||
class HashableDict(dict):
    """
    A dictionary that can be hashed by lru_cache.

    The hash is computed from the current items, so an instance must not
    be mutated while it is used as a cache key. Values that are themselves
    unhashable containers (dict, list, set) are hashed through an immutable
    view of their contents; the stored values are left untouched.
    """

    @staticmethod
    def _freeze(value: object) -> object:
        """Return a hashable stand-in for ``value`` (recursing into
        dict, list/tuple and set containers)."""
        if isinstance(value, dict):
            return frozenset(
                (key, HashableDict._freeze(item))
                for key, item in value.items())
        if isinstance(value, (list, tuple)):
            return tuple(HashableDict._freeze(item) for item in value)
        if isinstance(value, (set, frozenset)):
            # Set members are hashable by construction.
            return frozenset(value)
        return value

    # NOTE: pythonic dict is not hashable,
    # we override on it directly for simplicity
    def __hash__(self) -> int:  # type: ignore[override]
        # For dicts whose values are all hashable this equals
        # hash(frozenset(self.items())).
        return hash(self._freeze(self))
|
||||
|
||||
|
||||
def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
    """Combine the config-level processor kwargs with call-site kwargs.

    Entries from ``model_config.mm_processor_kwargs`` come first and are
    overridden by ``kwargs`` on key collisions. Top-level dict values are
    wrapped in ``HashableDict``.
    """
    config_level = model_config.mm_processor_kwargs
    if config_level is None:
        config_level = {}

    combined = {**config_level, **kwargs}

    # NOTE: A plain dict is unhashable and would raise an "unhashable type"
    # error when passed to `cached_get_processor`, so each dict value is
    # wrapped in a hashable dict.
    return {
        name: HashableDict(item) if isinstance(item, dict) else item
        for name, item in combined.items()
    }
|
||||
|
||||
|
||||
def get_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
trust_remote_code: bool = False,
|
||||
processor_cls: type[ProcessorMixin] = ProcessorMixin,
|
||||
processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
|
||||
**kwargs: Any,
|
||||
):
|
||||
) -> _P:
|
||||
"""Load a processor for the given model name via HuggingFace."""
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers import AutoProcessor
|
||||
|
||||
processor_factory = (AutoProcessor
|
||||
if processor_cls == ProcessorMixin else processor_cls)
|
||||
processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
|
||||
isinstance(processor_cls, tuple) else processor_cls)
|
||||
|
||||
try:
|
||||
processor = processor_factory.from_pretrained(
|
||||
@ -43,12 +77,30 @@ def get_processor(
|
||||
else:
|
||||
raise e
|
||||
|
||||
return cast(ProcessorMixin, processor)
|
||||
if not isinstance(processor, processor_cls):
|
||||
raise TypeError("Invalid type of HuggingFace processor. "
|
||||
f"Expected type: {processor_cls}, but "
|
||||
f"found type: {type(processor)}")
|
||||
|
||||
return processor
|
||||
|
||||
|
||||
cached_get_processor = lru_cache(get_processor)
|
||||
|
||||
|
||||
def cached_processor_from_config(
    model_config: "ModelConfig",
    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
    **kwargs: Any,
) -> _P:
    """Fetch a processor through `cached_get_processor` using the model
    name and `trust_remote_code` flag from `model_config`, with `kwargs`
    merged over the config's `mm_processor_kwargs`."""
    merged_kwargs = _merge_mm_kwargs(model_config, **kwargs)

    return cached_get_processor(
        model_config.model,
        trust_remote_code=model_config.trust_remote_code,
        processor_cls=processor_cls,  # type: ignore[arg-type]
        **merged_kwargs,
    )
|
||||
|
||||
|
||||
def get_image_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
@ -85,6 +137,20 @@ def get_image_processor(
|
||||
return cast(BaseImageProcessor, processor)
|
||||
|
||||
|
||||
cached_get_image_processor = lru_cache(get_image_processor)
|
||||
|
||||
|
||||
def cached_image_processor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Fetch an image processor through `cached_get_image_processor` using
    the model name and `trust_remote_code` flag from `model_config`, with
    `kwargs` merged over the config's `mm_processor_kwargs`."""
    merged_kwargs = _merge_mm_kwargs(model_config, **kwargs)

    return cached_get_image_processor(
        model_config.model,
        trust_remote_code=model_config.trust_remote_code,
        **merged_kwargs,
    )
|
||||
|
||||
|
||||
def get_video_processor(
|
||||
processor_name: str,
|
||||
*args: Any,
|
||||
@ -104,3 +170,17 @@ def get_video_processor(
|
||||
)
|
||||
|
||||
return cast(BaseImageProcessor, processor.video_processor)
|
||||
|
||||
|
||||
cached_get_video_processor = lru_cache(get_video_processor)
|
||||
|
||||
|
||||
def cached_video_processor_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Fetch a video processor through `cached_get_video_processor` using
    the model name and `trust_remote_code` flag from `model_config`, with
    `kwargs` merged over the config's `mm_processor_kwargs`."""
    merged_kwargs = _merge_mm_kwargs(model_config, **kwargs)

    return cached_get_video_processor(
        model_config.model,
        trust_remote_code=model_config.trust_remote_code,
        **merged_kwargs,
    )
|
||||
|
||||
@ -3,9 +3,10 @@
|
||||
import contextlib
|
||||
import os
|
||||
import warnings
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from types import MethodType
|
||||
from typing import Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, Optional, Union
|
||||
|
||||
import huggingface_hub
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||
@ -20,6 +21,9 @@ from vllm.transformers_utils.tokenizers import MistralTokenizer
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.utils import make_async
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
|
||||
@ -232,6 +236,22 @@ def get_tokenizer(
|
||||
return tokenizer
|
||||
|
||||
|
||||
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||
|
||||
|
||||
def cached_tokenizer_from_config(
    model_config: "ModelConfig",
    **kwargs: Any,
):
    """Fetch a tokenizer through `cached_get_tokenizer`, taking the
    tokenizer name, mode, revision and `trust_remote_code` flag from
    `model_config`; extra `kwargs` are forwarded unchanged."""
    tokenizer_name = model_config.tokenizer

    return cached_get_tokenizer(
        tokenizer_name,
        tokenizer_mode=model_config.tokenizer_mode,
        tokenizer_revision=model_config.tokenizer_revision,
        trust_remote_code=model_config.trust_remote_code,
        **kwargs,
    )
|
||||
|
||||
|
||||
def get_lora_tokenizer(lora_request: LoRARequest, *args,
|
||||
**kwargs) -> Optional[AnyTokenizer]:
|
||||
if lora_request is None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user