[VLM][Bugfix] Pass processor kwargs properly on init (#13516)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-02-19 21:13:50 +08:00 committed by GitHub
parent 52ce14d31f
commit 377d10bd14
44 changed files with 677 additions and 455 deletions

View File

@@ -85,6 +85,7 @@ def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
trust_remote_code=True,
max_model_len=8192,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
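For context, this example change moves the processor override from request time to engine construction. A minimal end-to-end sketch of the same pattern (generation is omitted; the model id is one used elsewhere in this commit):

from vllm import LLM

# Per the commit title, mm_processor_kwargs supplied at init are now
# passed through to the multimodal processor instead of being dropped.
llm = LLM(
    model="h2oai/h2ovl-mississippi-2b",  # illustrative choice
    trust_remote_code=True,
    max_model_len=8192,
    limit_mm_per_prompt={"image": 2},
    mm_processor_kwargs={"max_dynamic_patch": 4},
)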

View File

@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext(
model_config,
tokenizer=cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_info.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(model_config),
)
# Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30)
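The recurring change in this commit replaces ad-hoc cached_get_tokenizer calls with a single config-aware helper. A plausible sketch of cached_tokenizer_from_config, assuming it merely forwards the tokenizer-related ModelConfig fields to the existing helper:

from vllm.multimodal.utils import cached_get_tokenizer

def cached_tokenizer_from_config(model_config, **kwargs):
    # Assumed implementation: centralize the fields that call sites
    # previously had to remember (and sometimes forgot) to pass.
    return cached_get_tokenizer(
        model_config.tokenizer,
        tokenizer_mode=model_config.tokenizer_mode,
        tokenizer_revision=model_config.tokenizer_revision,
        trust_remote_code=model_config.trust_remote_code,
        **kwargs,
    )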

View File

@@ -1,17 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from typing import Optional
from typing import Mapping, Optional
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
width, height = image.size
# Calculate the expected number of blocks
if num_imgs == 1 and config.use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=1,
max_num=max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=3,
max_num=max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks = blocks1 + blocks2 - 1
return total_blocks
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
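A hand-checked instance of the MSAC accounting above, with made-up block counts:

blocks1, blocks2 = 4, 6  # raw blocks from the two passes
blocks1 += 1             # thumbnail added since blocks1 > 1
blocks2 += 1             # thumbnail added since blocks2 > 1
total_blocks = blocks1 + blocks2 - 1  # overlap counted once
assert total_blocks == 11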
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
processed_inputs = processor.apply("<image>" * len(images), mm_data,
mm_processor_kwargs)
# Ensure we have the right number of image placeholder tokens
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", [
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
@@ -25,118 +126,54 @@ from ...utils import build_model_context
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: _ImageAssets,
size_factors: list[float],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
num_imgs: int,
kwargs_on_init: bool,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
config = processor.info.get_hf_config()
use_msac = config.use_msac
mm_processor_kwargs = {
"max_dynamic_patch": max_dynamic_patch,
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
min_num = config.min_dynamic_patch
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
# Build the image str / prompt based on the number of images we pass
prompt = "<image>" * num_imgs
for asset in image_assets:
for factor in size_factors:
image = rescale_image_size(asset.pil_image, factor)
mm_data = {"image": [image] * num_imgs}
width, height = image.size
# Calculate the expected number of blocks
if num_imgs == 1 and use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks = blocks1 + blocks2 - 1
expected_num_patches = total_blocks
else:
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches != 1:
expected_num_patches += 1
processed_inputs = processor.apply(prompt, mm_data,
mm_processor_kwargs)
pixel_shape = (
processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
assert pixel_shape[0] == expected_num_patches * num_imgs
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
min_num,
max_num,
hf_processor_mm_kwargs,
)
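The new kwargs_on_init parameter asserts one invariant across these tests: overrides baked into the model config at construction time must behave exactly like overrides passed per call. A condensed sketch of the two paths (build_model_context is imported from the test utils above):

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

def make_processor(model_id, mm_kwargs, kwargs_on_init):
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
        mm_processor_kwargs=mm_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_tokenizer_from_config(ctx.model_config),
    )
    # Per-call kwargs stay empty when they were already given on init;
    # both configurations must yield identical processed inputs.
    return processor, ({} if kwargs_on_init else mm_kwargs)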

View File

@@ -4,7 +4,7 @@ import pytest
from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
@@ -22,9 +22,15 @@ models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(image_assets: _ImageAssets, model: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int, num_imgs: int):
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
@@ -33,15 +39,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
@@ -54,8 +60,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholder format is correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]

View File

@@ -1,64 +1,136 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
from typing import Optional
from typing import Mapping, Optional
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
model_id: str,
image_assets: _ImageAssets,
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
from vllm.model_executor.models.internvl import (
calculate_internvl_targets, get_internvl_target_ratios)
width, height = image.size
blocks, _, _ = calculate_internvl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_internvl_target_ratios(
min_num,
max_num,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
mm_processor_kwargs = {
"max_dynamic_patch": max_dynamic_patch,
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
# Build the image str / prompt based on the number of images we pass
prompt = "<image>" * num_imgs
image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
mm_data = {"image": [image] * num_imgs}
return expected_num_patches
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
if dynamic_image_size is False:
expected_num_patches = 1
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
processed_inputs = processor.apply("<image>" * len(images), mm_data,
mm_processor_kwargs)
# Ensure we have the right number of image placeholder tokens
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
assert img_tok_count == 256 * expected_num_patches * num_imgs
assert pixel_shape[0] == expected_num_patches * num_imgs
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", ["OpenGVLab/InternVL2-2B"])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: _ImageAssets,
size_factors: list[float],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
min_num,
max_num,
hf_processor_mm_kwargs,
)

View File

@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context
@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
info = processor.info
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
seen_aspect_ratios = set[float]()

View File

@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context
@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
info = processor.info
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
seen_aspect_ratios = set[float]()

View File

@@ -3,7 +3,7 @@
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
@@ -21,12 +21,14 @@ from ...utils import build_model_context
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early
@@ -36,23 +38,22 @@ def test_processor_override(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)

View File

@@ -3,7 +3,7 @@
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
@@ -18,6 +18,7 @@ from ...utils import build_model_context
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model_id: str,
@@ -25,31 +26,30 @@ def test_processor_override(
expected_toks_per_img: int,
expected_pixels_shape: tuple[int, int],
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of image placeholder tokens
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape

View File

@@ -248,13 +248,16 @@ def check_logprobs_close(
warnings.warn(fail_msg, stacklevel=2)
def build_model_context(model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[Dict] = None,
limit_mm_per_prompt: Optional[Dict] = None):
def build_model_context(
model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[Dict] = None,
limit_mm_per_prompt: Optional[Dict] = None,
disable_mm_preprocessor_cache: bool = True,
):
"""Creates an InputContext for a given model.
Args:
@@ -283,5 +286,6 @@ def build_model_context(model_name: str,
seed=0,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
)
return InputContext(model_config)
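An illustrative call with the reworked signature (the model id is reused from the InternVL tests in this commit; disabling the preprocessor cache by default presumably keeps parametrized cases from sharing cached results):

ctx = build_model_context(
    model_name="OpenGVLab/InternVL2-2B",
    tokenizer_name="OpenGVLab/InternVL2-2B",
    trust_remote_code=True,
    mm_processor_kwargs={"max_dynamic_patch": 4},
    limit_mm_per_prompt={"image": 2},
    disable_mm_preprocessor_cache=True,  # new parameter, default True
)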

View File

@@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.utils import full_groupby
from .utils import random_image
@@ -577,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer),
tokenizer=cached_tokenizer_from_config(model_config),
)
profiler = MultiModalProfiler(processor)
@@ -617,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer),
tokenizer=cached_tokenizer_from_config(model_config),
)
rng = np.random.RandomState(0)
@@ -689,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer),
tokenizer=cached_tokenizer_from_config(model_config),
)
orig_get_hf_processor = processor.info.get_hf_processor

View File

@@ -11,8 +11,9 @@ from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides,
resolve_mm_processor_kwargs)
@@ -27,19 +28,9 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig)
P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin)
class HashableDict(dict):
"""
A dictionary that can be hashed by lru_cache.
"""
# NOTE: pythonic dict is not hashable,
# we override on it directly for simplicity
def __hash__(self) -> int: # type: ignore[override]
return hash(frozenset(self.items()))
_T = TypeVar("_T")
_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
@dataclass(frozen=True)
@@ -54,9 +45,9 @@ class InputContext:
def get_hf_config(
self,
typ: Union[type[C], tuple[type[C], ...]] = PretrainedConfig,
typ: Union[type[_C], tuple[type[_C], ...]] = PretrainedConfig,
/,
) -> C:
) -> _C:
"""
Get the HuggingFace configuration
(:class:`transformers.PretrainedConfig`) of the model,
@@ -94,10 +85,10 @@ class InputContext:
def get_hf_processor(
self,
typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin,
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
/,
**kwargs: object,
) -> P:
) -> _P:
"""
Get the HuggingFace processor
(:class:`transformers.ProcessorMixin`) of the model,
@@ -106,33 +97,29 @@ class InputContext:
Raises:
TypeError: If the processor is not of the specified type.
"""
return cached_processor_from_config(
self.model_config,
processor_cls=typ,
**kwargs,
)
def init_processor(
self,
typ: type[_T],
/,
**kwargs: object,
) -> _T:
"""
Initialize a HuggingFace-like processor class, merging the
keyword arguments with those in the model's configuration.
"""
base_kwargs = self.model_config.mm_processor_kwargs
if base_kwargs is None:
base_kwargs = {}
merged_kwargs = {**base_kwargs, **kwargs}
if isinstance(typ, type):
merged_kwargs["processor_cls"] = typ
# NOTE: Pythonic dict is not hashable and will raise unhashable type
# error when calling `cached_get_processor`, therefore we need to
# wrap it to a hashable dict.
for key, value in merged_kwargs.items():
if isinstance(value, dict):
merged_kwargs[key] = HashableDict(value)
hf_processor = cached_get_processor(
self.model_config.model,
trust_remote_code=self.model_config.trust_remote_code,
**merged_kwargs,
)
if not isinstance(hf_processor, typ):
raise TypeError("Invalid type of HuggingFace processor. "
f"Expected type: {typ}, but "
f"found type: {type(hf_processor)}")
return hf_processor
return typ(**merged_kwargs)
@dataclass(frozen=True)
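init_processor merges the config-level kwargs under the call-site kwargs, so explicit arguments win. A runnable illustration of that merge order with made-up values:

base_kwargs = {"max_dynamic_patch": 4}        # model_config.mm_processor_kwargs
call_kwargs = {"dynamic_image_size": False}   # explicit **kwargs
merged = {**base_kwargs, **call_kwargs}       # same spread order as above
assert merged == {"max_dynamic_patch": 4, "dynamic_image_size": False}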
@@ -142,10 +129,10 @@ class InputProcessingContext(InputContext):
def get_hf_processor(
self,
typ: Union[type[P], tuple[type[P], ...]] = ProcessorMixin,
typ: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
/,
**kwargs: object,
) -> P:
) -> _P:
return super().get_hf_processor(
typ,
tokenizer=self.tokenizer,
@@ -341,13 +328,9 @@ class InputRegistry:
from vllm.model_executor.model_loader import get_model_architecture
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer
if mm_registry.has_processor(model_config):
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(model_config)
processor = mm_registry.create_processor(model_config, tokenizer)
profiler = MultiModalProfiler(processor)
dummy_data = profiler.get_dummy_data(

View File

@@ -400,8 +400,8 @@ class AriaProcessingInfo(BaseProcessingInfo):
def get_vision_config(self):
return self.get_hf_config().vision_config
def get_hf_processor(self):
return self.ctx.get_hf_processor(AriaProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(AriaProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}

View File

@@ -58,8 +58,8 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config(ChameleonConfig)
def get_hf_processor(self):
return self.ctx.get_hf_processor(ChameleonProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(ChameleonProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}

View File

@@ -28,13 +28,13 @@ from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config,
MlpProjectorConfig,
VisionEncoderConfig)
from vllm.transformers_utils.processors.deepseek_vl2 import (
DeepseekVLV2Processor)
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils import is_list_of
from .interfaces import SupportsMultiModal, SupportsPP
@@ -133,8 +133,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config(DeepseekVLV2Config)
def get_hf_processor(self) -> DeepseekVLV2Processor:
return self.ctx.get_hf_processor(DeepseekVLV2Processor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
@@ -308,13 +308,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.text_config = config.text_config
model_config = vllm_config.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
tokenizer_revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
)
self.image_token_id = tokenizer.vocab.get(_IMAGE_TOKEN)
tokenizer = cached_tokenizer_from_config(model_config)
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
self.vision = self._init_vision_module(self.vision_config,
quant_config,

View File

@@ -71,8 +71,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config(FuyuConfig)
def get_hf_processor(self):
return self.ctx.get_hf_processor(FuyuProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(FuyuProcessor, **kwargs)
def get_image_processor(self) -> FuyuImageProcessor:
return self.get_hf_processor().image_processor

View File

@@ -416,18 +416,15 @@ class GLM4VProcessor:
class GLM4VProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self):
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
return tokenizer
def get_hf_config(self):
return self.ctx.get_hf_config(ChatGLMConfig)
def get_hf_processor(self) -> GLM4VProcessor:
return GLM4VProcessor(
self.get_hf_config(),
self.get_tokenizer(),
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
return self.ctx.init_processor(
GLM4VProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:

View File

@@ -15,9 +15,9 @@ from vllm.model_executor.layers.pooler import PoolerHead
from vllm.model_executor.models.llama import LlamaForCausalLM
from vllm.model_executor.pooling_metadata import (PoolingMetadata,
PoolingTensors)
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (IntermediateTensors, PoolerOutput,
PoolingSequenceGroupOutput)
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
logger = init_logger(__name__)
@@ -29,12 +29,7 @@ class GritLMPooler(nn.Module):
self.model_config = model_config
tokenizer = cached_get_tokenizer(
self.model_config.tokenizer,
tokenizer_mode=self.model_config.tokenizer_mode,
tokenizer_revision=self.model_config.tokenizer_revision,
trust_remote_code=self.model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(self.model_config)
# Collect the tokens needed for pattern matching.
# "▁<" is different from "_<". The former uses "▁" to indicate that

View File

@@ -41,6 +41,7 @@ def resolve_h2ovl_min_max_num(
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
@@ -190,7 +191,7 @@ def image_to_pixel_values_h2ovl(
pixel_values1, aspect_ratio1 = _preprocess_image(
image,
input_size=input_size,
min_num=min_num,
min_num=1,
max_num=max_num,
use_thumbnail=True,
prior_aspect_ratio=None,
@@ -199,7 +200,7 @@
pixel_values2, _ = _preprocess_image(
image,
input_size=input_size,
min_num=3, # Hardcoded value
min_num=3,
max_num=max_num,
use_thumbnail=True,
prior_aspect_ratio=aspect_ratio1,
@@ -228,6 +229,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
config: PretrainedConfig,
tokenizer: AnyTokenizer,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_msac: Optional[bool] = None,
@@ -235,6 +237,7 @@
super().__init__(
config,
tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
@@ -267,11 +270,13 @@ class H2OVLProcessor(BaseInternVLProcessor):
def resolve_min_max_num(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None,
) -> tuple[int, int]:
min_dynamic_patch = self.min_dynamic_patch
min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
is None else min_dynamic_patch)
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
is None else max_dynamic_patch)
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
@@ -289,18 +294,21 @@ class H2OVLProcessor(BaseInternVLProcessor):
def resolve_target_ratios(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None,
prior_aspect_ratio: Optional[tuple[int, int]] = None,
override_min_num: Optional[int] = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
if prior_aspect_ratio: # hardcoded value for second pass of use_msac
min_num = 3
if override_min_num is not None:
min_num = override_min_num
return get_h2ovl_target_ratios(
min_num,
@@ -322,6 +330,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
if use_msac:
target_ratios_1 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
override_min_num=1,
)
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
orig_width=image_width,
@@ -334,6 +343,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
target_ratios_2 = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
prior_aspect_ratio=aspect_ratio_1,
override_min_num=3,
)
num_patches_2, _, _, _ = calculate_h2ovl_targets(
orig_width=image_width,
@@ -361,12 +371,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
) -> list[torch.Tensor]:
use_msac = self.use_msac if len(images) == 1 else False
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
@@ -389,14 +401,23 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> H2OVLProcessor:
return H2OVLProcessor(
self.get_hf_config(),
self.get_tokenizer(),
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
H2OVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_mm_max_tokens_per_item(
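The None checks above keep an unset override from clobbering a config default with an explicit None before the kwargs reach init_processor. The same guard in isolation (helper name is hypothetical):

def drop_unset(**overrides):
    # Forward only the overrides the caller actually set.
    return {k: v for k, v in overrides.items() if v is not None}

assert drop_unset(max_dynamic_patch=4, dynamic_image_size=None) == {
    "max_dynamic_patch": 4,
}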

View File

@@ -83,13 +83,15 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
class Idefics3ProcessingInfo(BaseProcessingInfo):
def get_hf_processor(
self,
*,
size: Optional[Dict[str, int]] = None) -> Idefics3Processor:
self,
*,
size: Optional[Dict[str, int]] = None,
**kwargs: object,
) -> Idefics3Processor:
if size is not None:
return self.ctx.get_hf_processor(Idefics3Processor, size=size)
kwargs["size"] = size
return self.ctx.get_hf_processor(Idefics3Processor)
return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}

View File

@@ -120,6 +120,7 @@ def resolve_internvl_min_max_num(
dynamic_image_size: bool,
use_thumbnail: bool,
) -> tuple[int, int]:
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
if use_thumbnail and max_dynamic_patch != 1:
@@ -247,6 +248,7 @@ class BaseInternVLProcessor(ABC):
config: PretrainedConfig,
tokenizer: AnyTokenizer,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
) -> None:
@@ -258,18 +260,22 @@ class BaseInternVLProcessor(ABC):
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size)**2 * (config.downsample_ratio**2))
self.image_size = image_size
self.min_dynamic_patch: int = config.min_dynamic_patch
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@@ -298,11 +304,13 @@ class BaseInternVLProcessor(ABC):
def resolve_min_max_num(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None,
) -> tuple[int, int]:
min_dynamic_patch = self.min_dynamic_patch
min_dynamic_patch = (self.min_dynamic_patch if min_dynamic_patch
is None else min_dynamic_patch)
max_dynamic_patch = (self.max_dynamic_patch if max_dynamic_patch
is None else max_dynamic_patch)
dynamic_image_size = (self.dynamic_image_size if dynamic_image_size
@@ -320,11 +328,13 @@ class BaseInternVLProcessor(ABC):
def resolve_target_ratios(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
use_thumbnail: Optional[bool] = None,
) -> list[tuple[int, int]]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
@@ -355,10 +365,12 @@ class BaseInternVLProcessor(ABC):
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
@@ -378,6 +390,7 @@ class BaseInternVLProcessor(ABC):
self,
text: Optional[Union[str, list[str]]] = None,
images: Optional[Union[Image.Image, list[Image.Image]]] = None,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
@@ -396,6 +409,7 @@ class BaseInternVLProcessor(ABC):
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
@@ -451,8 +465,10 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> BaseInternVLProcessor:
raise NotImplementedError
@@ -642,14 +658,23 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> InternVLProcessor:
return InternVLProcessor(
self.get_hf_config(),
self.get_tokenizer(),
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)

View File

@@ -119,7 +119,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
return get_vision_encoder_info(self.get_hf_config())
@abstractmethod
def get_hf_processor(self) -> LlavaLikeProcessor:
def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
@@ -208,8 +208,8 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
@@ -272,8 +272,8 @@ class LlavaMultiModalProcessor(
class PixtralHFProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_processor(self):
return self.ctx.get_hf_processor(PixtralProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(PixtralProcessor, **kwargs)
class PixtralHFMultiModalProcessor(
@@ -742,23 +742,24 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
class MantisProcessingInfo(LlavaProcessingInfo):
def get_hf_processor(self):
def get_hf_processor(self, **kwargs: object):
hf_config = self.get_hf_config()
vision_info = self.get_vision_encoder_info()
kwargs.setdefault("patch_size", vision_info.get_patch_size())
if Version(TRANSFORMERS_VERSION) < Version("4.48"):
# BUG: num_additional_image_tokens = 0 but treated as 1,
# so we set vision_feature_select_strategy to None to offset this
vision_feature_select_strategy = None
kwargs.setdefault("vision_feature_select_strategy", None)
else:
# FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150
vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501
kwargs.setdefault(
"vision_feature_select_strategy",
hf_config.vision_feature_select_strategy,
)
return self.ctx.get_hf_processor(
LlavaProcessor,
patch_size=vision_info.get_patch_size(),
vision_feature_select_strategy=vision_feature_select_strategy,
)
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
class MantisMultiModalProcessor(LlavaMultiModalProcessor):

View File

@@ -72,8 +72,8 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
def get_hf_config(self) -> LlavaNextLikeConfig:
return self.ctx.get_hf_config(LlavaNextConfig)
def get_hf_processor(self):
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor)
def get_hf_processor(self, **kwargs: object):
hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor, **kwargs)
# In case patch_size is omitted from `processor_config.json`
# e.g. for E5-V: https://huggingface.co/royokong/e5-v

View File

@@ -56,8 +56,8 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
def get_vision_encoder_info(self):
return get_vision_encoder_info(self.get_hf_config())
def get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaNextVideoProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"video": 1}

View File

@@ -97,8 +97,8 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
def get_hf_config(self) -> LlavaOnevisionLikeConfig:
return self.ctx.get_hf_config(LlavaOnevisionConfig)
def get_hf_processor(self):
return self.ctx.get_hf_processor(LlavaOnevisionProcessor)
def get_hf_processor(self, **kwargs: object):
return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}

View File

@@ -331,11 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
def get_hf_config(self):
return self.ctx.get_hf_config()
def get_hf_processor(
self,
**kwargs: object,
):
hf_processor = self.ctx.get_hf_processor()
def get_hf_processor(self, **kwargs: object):
hf_processor = self.ctx.get_hf_processor(**kwargs)
# NumPy arrays are considered as Iterable but not Sequence in
# https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428

View File

@@ -94,8 +94,8 @@ class MllamaProcessingInfo(BaseProcessingInfo):
def get_hf_config(self) -> MllamaConfig:
return self.ctx.get_hf_config(MllamaConfig)
def get_hf_processor(self) -> MllamaProcessor:
return self.ctx.get_hf_processor(MllamaProcessor)
def get_hf_processor(self, **kwargs: object) -> MllamaProcessor:
return self.ctx.get_hf_processor(MllamaProcessor, **kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}

View File

@@ -1200,8 +1200,8 @@ class MolmoProcessorWrapper:
class MolmoProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self) -> MolmoProcessorWrapper:
processor = self.ctx.get_hf_processor()
def get_hf_processor(self, **kwargs: object) -> MolmoProcessorWrapper:
processor = self.ctx.get_hf_processor(**kwargs)
return MolmoProcessorWrapper(processor)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:

View File

@@ -69,14 +69,23 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(
self,
*,
min_dynamic_patch: Optional[int] = None,
max_dynamic_patch: Optional[int] = None,
dynamic_image_size: Optional[bool] = None,
**kwargs: object,
) -> NVLMProcessor:
return NVLMProcessor(
self.get_hf_config(),
self.get_tokenizer(),
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
if min_dynamic_patch is not None:
kwargs["min_dynamic_patch"] = min_dynamic_patch
if max_dynamic_patch is not None:
kwargs["max_dynamic_patch"] = max_dynamic_patch
if dynamic_image_size is not None:
kwargs["dynamic_image_size"] = dynamic_image_size
return self.ctx.init_processor(
NVLMProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_max_image_tokens(self) -> int:

View File

@@ -16,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsMultiModal, SupportsPP
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
@@ -88,7 +88,7 @@ def input_processor_for_paligemma(ctx: InputContext,
model_config = ctx.model_config
hf_config = ctx.get_hf_config(PaliGemmaConfig)
tokenizer = cached_get_tokenizer(model_config.tokenizer)
tokenizer = cached_tokenizer_from_config(model_config)
image_feature_size = hf_config.text_config.num_image_tokens
image_token_str = tokenizer.decode(hf_config.image_token_index)
bos_token = tokenizer.decode(hf_config.bos_token_id)

View File

@@ -313,11 +313,12 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
self,
*,
num_crops: Optional[int] = None,
**kwargs: object,
) -> ProcessorMixin:
if num_crops is not None:
return self.ctx.get_hf_processor(num_crops=num_crops)
kwargs["num_crops"] = num_crops
return self.ctx.get_hf_processor()
return self.ctx.get_hf_processor(**kwargs)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}

View File

@@ -32,9 +32,9 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges)
from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (init_vllm_registered_model, maybe_prefix,
@@ -49,9 +49,7 @@ except ImportError:
def get_max_pixtral_image_tokens(ctx: InputContext):
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
mm_encoder = tokenizer.instruct.mm_encoder
image_config = mm_encoder.mm_config if hasattr(
@@ -65,9 +63,7 @@ def get_max_pixtral_image_tokens(ctx: InputContext):
def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
image_token_id = mm_encoder.special_ids.img
@@ -109,9 +105,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
data_list = data if isinstance(data, list) else [data]
@@ -138,9 +132,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
prompt_token_ids = inputs.get("prompt_token_ids")
prompt = inputs.get("prompt")
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
tokenizer_mode=ctx.model_config.tokenizer_mode)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder
image_token_id = mm_encoder.special_ids.img

View File

@@ -36,8 +36,6 @@ from transformers import BatchFeature
from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig)
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLImageProcessorFast)
from vllm.attention import AttentionMetadata
from vllm.config import VllmConfig
@@ -690,41 +688,20 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo):
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
fps: Optional[float] = 2.0,
size: Optional[dict[str, int]] = None,
fps: Optional[float] = None,
**kwargs: object,
) -> Qwen2_5_VLProcessor:
hf_processor = self.ctx.get_hf_processor(Qwen2_5_VLProcessor)
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
if fps is not None:
kwargs["fps"] = fps
if min_pixels:
image_processor.min_pixels = min_pixels
if max_pixels:
image_processor.max_pixels = max_pixels
if max_pixels or min_pixels:
image_processor.size = {
"min_pixels": image_processor.min_pixels,
"max_pixels": image_processor.max_pixels,
}
return hf_processor
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
fps: Optional[float] = 2.0,
) -> Union[Qwen2VLImageProcessor, Qwen2VLImageProcessorFast]:
hf_processor = self.get_hf_processor(
min_pixels=min_pixels,
max_pixels=max_pixels,
fps=fps,
return self.ctx.get_hf_processor(
Qwen2_5_VLProcessor,
image_processor=self.get_image_processor(min_pixels=min_pixels,
max_pixels=max_pixels,
size=size),
**kwargs,
)
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
return image_processor
class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):

View File

@@ -93,8 +93,9 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
*,
# Ignored in initialization
sampling_rate: Optional[int] = None,
**kwargs: object,
) -> Qwen2AudioProcessor:
return self.ctx.get_hf_processor(Qwen2AudioProcessor)
return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs)
def get_feature_extractor(
self,

View File

@@ -31,9 +31,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from packaging.version import Version
from transformers import BatchFeature
from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
Qwen2VLProcessor)
from transformers.models.qwen2_vl.configuration_qwen2_vl import (
@@ -69,6 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import (
cached_image_processor_from_config)
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper,
@ -722,40 +722,64 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
) -> Qwen2VLProcessor:
hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor)
image_processor = hf_processor.image_processor # type: ignore
assert isinstance(image_processor, Qwen2VLImageProcessor)
return self.ctx.get_hf_processor(
Qwen2VLProcessor,
image_processor=self.get_image_processor(min_pixels=min_pixels,
max_pixels=max_pixels,
size=size),
**kwargs,
)
if min_pixels:
image_processor.min_pixels = min_pixels
if max_pixels:
image_processor.max_pixels = max_pixels
if max_pixels or min_pixels:
image_processor.size = {
"min_pixels": image_processor.min_pixels,
"max_pixels": image_processor.max_pixels,
}
def _get_image_processor_kwargs(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
if self.ctx.model_config.mm_processor_kwargs:
kwargs.update(self.ctx.model_config.mm_processor_kwargs)
return hf_processor
if min_pixels is not None:
kwargs["min_pixels"] = min_pixels
if size is None:
size = {"shortest_edge": min_pixels}
else:
size["shortest_edge"] = min_pixels
if max_pixels is not None:
kwargs["max_pixels"] = max_pixels
if size is None:
size = {"longest_edge": max_pixels}
else:
size["longest_edge"] = max_pixels
if size is not None:
kwargs["size"] = size
return kwargs
def get_image_processor(
self,
*,
min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None,
size: Optional[dict[str, int]] = None,
**kwargs: object,
):
hf_processor = self.get_hf_processor(min_pixels=min_pixels,
max_pixels=max_pixels)
image_processor = hf_processor.image_processor # type: ignore
if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
assert isinstance(
image_processor,
(Qwen2VLImageProcessor, Qwen2VLImageProcessorFast))
else:
assert isinstance(image_processor, Qwen2VLImageProcessor)
return image_processor
return cached_image_processor_from_config(
self.ctx.model_config,
**self._get_image_processor_kwargs(min_pixels=min_pixels,
max_pixels=max_pixels,
size=size,
**kwargs),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": None}
@ -952,6 +976,18 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
def _get_data_parser(self) -> MultiModalDataParser:
return Qwen2VLMultiModalDataParser()
def _call_hf_processor(
self,
prompt: str,
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
return self.info.ctx.call_hf_processor(
self.info.get_hf_processor(**mm_kwargs),
dict(text=prompt, **mm_data),
self.info._get_image_processor_kwargs(**mm_kwargs),
)
def _get_prompt_replacements(
self,
mm_items: MultiModalDataItems,
@ -964,8 +1000,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
tokenizer = self.info.get_tokenizer()
vocab = tokenizer.get_vocab()
# NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has
# image_token and video_token registered
placeholder = {
"image": vocab[hf_processor.image_token],
"video": vocab[hf_processor.video_token],

vllm/model_executor/models/qwen_vl.py View File

@ -519,8 +519,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
return _get_tokenizer_without_image_pad(tokenizer)
def get_hf_processor(self) -> QwenVLProcessor:
return QwenVLProcessor(self.get_hf_config(), self.get_tokenizer())
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor(
QwenVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
**kwargs,
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}

vllm/model_executor/models/ultravox.py View File

@ -68,8 +68,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
*,
# Ignored in initialization
sampling_rate: Optional[int] = None,
**kwargs: object,
) -> ProcessorMixin:
hf_processor = self.ctx.get_hf_processor()
hf_processor = self.ctx.get_hf_processor(**kwargs)
# NOTE: Ultravox processing definition uses '<|eot_id|>' as the
# placeholder that will cause confusion with the actual end of turn

vllm/model_executor/models/whisper.py View File

@ -29,7 +29,7 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.audio import resample_audio
from vllm.sequence import SequenceData
from vllm.transformers_utils.processor import cached_get_processor
from vllm.transformers_utils.processor import cached_processor_from_config
from .interfaces import SupportsMultiModal, SupportsTranscription
from .utils import AutoWeightsLoader, WeightsMapper, make_layers
@ -579,7 +579,7 @@ def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
assert mm_counts["audio"] == 1
num_tokens = get_max_whisper_audio_tokens(ctx)
processor = cached_get_processor(ctx.model_config.model)
processor = cached_processor_from_config(ctx.model_config)
chunk_length = processor.feature_extractor.chunk_length
sampling_rate = processor.feature_extractor.sampling_rate
num_samples = chunk_length * sampling_rate
@ -596,7 +596,7 @@ def input_processor_for_whisper(ctx: InputContext, inputs):
multi_modal_data["audio"] = multi_modal_data["audio"][0]
# Resample and process audio
audio, orig_sr = multi_modal_data["audio"]
processor = cached_get_processor(ctx.model_config.model)
processor = cached_processor_from_config(ctx.model_config)
target_sr = processor.feature_extractor.sampling_rate
audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
multi_modal_data["audio"] = (audio, target_sr)
@ -618,7 +618,7 @@ def input_mapper_for_whisper(
if len(multi_modal_data) == 0:
return MultiModalKwargs()
processor = cached_get_processor(ctx.model_config.model)
processor = cached_processor_from_config(ctx.model_config)
sampling_rate = processor.feature_extractor.sampling_rate
audios = [audio for audio, _ in multi_modal_data]
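
A hedged sketch of the new call shape at these three sites; the SimpleNamespace stand-in exposes only the fields the helper reads, and a real vllm.config.ModelConfig works identically (the checkpoint name is an example and is downloaded on first use):

from types import SimpleNamespace

from vllm.transformers_utils.processor import cached_processor_from_config

model_config = SimpleNamespace(
    model="openai/whisper-small",  # example checkpoint
    trust_remote_code=False,
    mm_processor_kwargs=None,      # merged in automatically when set
)
processor = cached_processor_from_config(model_config)
print(processor.feature_extractor.sampling_rate)  # 16000 for Whisper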

vllm/multimodal/image.py View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
import base64
from functools import lru_cache
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional
@ -11,7 +10,7 @@ from PIL import Image
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_image_processor
from vllm.transformers_utils.processor import cached_get_image_processor
from vllm.utils import is_list_of
from .base import MediaIO, MultiModalPlugin
@ -22,8 +21,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
cached_get_image_processor = lru_cache(get_image_processor)
class ImagePlugin(MultiModalPlugin):
"""Plugin for image data."""

vllm/multimodal/registry.py View File

@ -11,7 +11,8 @@ import torch.nn as nn
from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
from vllm.inputs import InputProcessingContext
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config)
from vllm.utils import ClassRegistry
from .audio import AudioPlugin
@ -21,7 +22,6 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
from .processing import (BaseMultiModalProcessor, BaseProcessingInfo,
ProcessingCache)
from .profiling import BaseDummyInputsBuilder, MultiModalProfiler
from .utils import cached_get_tokenizer
from .video import VideoPlugin
if TYPE_CHECKING:
@ -256,10 +256,7 @@ class MultiModalRegistry:
on underlying model configuration.
"""
if self.has_processor(model_config):
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(model_config)
processor = self.create_processor(model_config, tokenizer)
seq_len = model_config.max_model_len
mm_limits = self.get_mm_limits_per_prompt(model_config)
@ -374,10 +371,7 @@ class MultiModalRegistry:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
if self.has_processor(model_config):
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(model_config)
processor = self.create_processor(model_config, tokenizer)
profiler = MultiModalProfiler(processor)
return profiler.get_mm_limits()

vllm/multimodal/utils.py View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from functools import lru_cache
from itertools import groupby
from pathlib import Path
from typing import TYPE_CHECKING, Optional, TypeVar, Union
@ -13,7 +12,7 @@ from PIL import Image
import vllm.envs as envs
from vllm.connections import HTTPConnection, global_http_connection
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .audio import AudioMediaIO
from .base import MediaIO
@ -23,8 +22,6 @@ from .video import VideoMediaIO
logger = init_logger(__name__)
cached_get_tokenizer = lru_cache(get_tokenizer)
_M = TypeVar("_M")
if TYPE_CHECKING:

vllm/multimodal/video.py View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
import base64
from functools import lru_cache, partial
from functools import partial
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional
@ -12,8 +12,7 @@ from PIL import Image
from vllm.inputs.registry import InputContext
from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_video_processor
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.transformers_utils.processor import cached_get_video_processor
from vllm.utils import PlaceholderModule, is_list_of
from .base import MediaIO, ModalityData
@ -30,9 +29,6 @@ except ImportError:
logger = init_logger(__name__)
cached_get_video_processor = lru_cache(get_video_processor)
cached_get_tokenizer = lru_cache(get_tokenizer)
class VideoPlugin(ImagePlugin):
"""Plugin for video data."""

vllm/transformers_utils/processor.py View File

@ -1,25 +1,59 @@
# SPDX-License-Identifier: Apache-2.0
from functools import lru_cache
from typing import Any, cast
from typing import TYPE_CHECKING, Any, Union, cast
from transformers.processing_utils import ProcessorMixin
from typing_extensions import TypeVar
if TYPE_CHECKING:
from vllm.config import ModelConfig
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
class HashableDict(dict):
"""
A dictionary that can be hashed by lru_cache.
"""
# NOTE: A plain Python dict is not hashable,
# so we override __hash__ directly for simplicity
def __hash__(self) -> int: # type: ignore[override]
return hash(frozenset(self.items()))
def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
base_kwargs = model_config.mm_processor_kwargs
if base_kwargs is None:
base_kwargs = {}
merged_kwargs = {**base_kwargs, **kwargs}
# NOTE: A plain Python dict is not hashable and would raise an
# unhashable-type error when calling `cached_get_processor`, so we
# wrap any dict values in a HashableDict.
for key, value in merged_kwargs.items():
if isinstance(value, dict):
merged_kwargs[key] = HashableDict(value)
return merged_kwargs
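
A runnable sketch of what these two pieces buy, importing the private helper purely for demonstration: call-site kwargs override config-level defaults, and nested dicts come back as HashableDict so lru_cache can key on them.

from functools import lru_cache
from types import SimpleNamespace

from vllm.transformers_utils.processor import HashableDict, _merge_mm_kwargs

# Stand-in exposing the one attribute _merge_mm_kwargs reads.
cfg = SimpleNamespace(mm_processor_kwargs={"max_dynamic_patch": 4})

merged = _merge_mm_kwargs(cfg, max_dynamic_patch=8,
                          size={"shortest_edge": 256})
assert merged["max_dynamic_patch"] == 8          # call-site kwargs win
assert isinstance(merged["size"], HashableDict)  # nested dict is hashable

@lru_cache
def fake_loader(**kwargs):
    return object()

fake_loader(**merged)  # fine; a plain dict value here would raise TypeError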
def get_processor(
processor_name: str,
*args: Any,
trust_remote_code: bool = False,
processor_cls: type[ProcessorMixin] = ProcessorMixin,
processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
**kwargs: Any,
):
) -> _P:
"""Load a processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from transformers import AutoProcessor
processor_factory = (AutoProcessor
if processor_cls == ProcessorMixin else processor_cls)
processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
isinstance(processor_cls, tuple) else processor_cls)
try:
processor = processor_factory.from_pretrained(
@ -43,12 +77,30 @@ def get_processor(
else:
raise e
return cast(ProcessorMixin, processor)
if not isinstance(processor, processor_cls):
raise TypeError("Invalid type of HuggingFace processor. "
f"Expected type: {processor_cls}, but "
f"found type: {type(processor)}")
return processor
cached_get_processor = lru_cache(get_processor)
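
The effect of the stricter factory selection and the new isinstance check, in a short hedged example (illustrative checkpoint, downloaded on first call): a concrete class acts as the factory and validates the result, while a tuple keeps AutoProcessor as the factory but still type-checks.

from transformers import WhisperProcessor

from vllm.transformers_utils.processor import get_processor

proc = get_processor("openai/whisper-small", processor_cls=WhisperProcessor)
assert isinstance(proc, WhisperProcessor)  # a wrong class now fails fast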
def cached_processor_from_config(
model_config: "ModelConfig",
processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
**kwargs: Any,
) -> _P:
return cached_get_processor(
model_config.model,
trust_remote_code=model_config.trust_remote_code,
processor_cls=processor_cls, # type: ignore[arg-type]
**_merge_mm_kwargs(model_config, **kwargs),
)
def get_image_processor(
processor_name: str,
*args: Any,
@ -85,6 +137,20 @@ def get_image_processor(
return cast(BaseImageProcessor, processor)
cached_get_image_processor = lru_cache(get_image_processor)
def cached_image_processor_from_config(
model_config: "ModelConfig",
**kwargs: Any,
):
return cached_get_image_processor(
model_config.model,
trust_remote_code=model_config.trust_remote_code,
**_merge_mm_kwargs(model_config, **kwargs),
)
def get_video_processor(
processor_name: str,
*args: Any,
@ -104,3 +170,17 @@ def get_video_processor(
)
return cast(BaseImageProcessor, processor.video_processor)
cached_get_video_processor = lru_cache(get_video_processor)
def cached_video_processor_from_config(
model_config: "ModelConfig",
**kwargs: Any,
):
return cached_get_video_processor(
model_config.model,
trust_remote_code=model_config.trust_remote_code,
**_merge_mm_kwargs(model_config, **kwargs),
)

vllm/transformers_utils/tokenizer.py View File

@ -3,9 +3,10 @@
import contextlib
import os
import warnings
from functools import lru_cache
from pathlib import Path
from types import MethodType
from typing import Optional, Union
from typing import TYPE_CHECKING, Any, Optional, Union
import huggingface_hub
from transformers import (AutoTokenizer, PreTrainedTokenizer,
@ -20,6 +21,9 @@ from vllm.transformers_utils.tokenizers import MistralTokenizer
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import make_async
if TYPE_CHECKING:
from vllm.config import ModelConfig
logger = init_logger(__name__)
AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
@ -232,6 +236,22 @@ def get_tokenizer(
return tokenizer
cached_get_tokenizer = lru_cache(get_tokenizer)
def cached_tokenizer_from_config(
model_config: "ModelConfig",
**kwargs: Any,
):
return cached_get_tokenizer(
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
tokenizer_revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
**kwargs,
)
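
A sketch of the intended call shape, using a duck-typed stand-in so the snippet stays self-contained (a real ModelConfig carries the same four attributes; "gpt2" is just an example tokenizer, fetched on first use):

from types import SimpleNamespace

from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

cfg = SimpleNamespace(tokenizer="gpt2", tokenizer_mode="auto",
                      tokenizer_revision=None, trust_remote_code=False)

tok1 = cached_tokenizer_from_config(cfg)
tok2 = cached_tokenizer_from_config(cfg)
assert tok1 is tok2  # identical args hit the shared lru_cache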
def get_lora_tokenizer(lora_request: LoRARequest, *args,
**kwargs) -> Optional[AnyTokenizer]:
if lora_request is None: