mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 05:55:55 +08:00
[Bugfix] Fix max image feature size for Llava-one-vision (#12104)
Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
parent
92e793d91a
commit
874f7c292a
@ -13,6 +13,67 @@ from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Check that ``image_size`` does not exceed the reported max token count.

    Rather than raising, a failing size is recorded as ``(image_size, exc)``
    in ``failed_size_excs`` so the caller can sweep many sizes in parallel
    and report every failure at once.
    """
    feature_size = processor.info.get_num_image_tokens(
        image_width=image_size.width,
        image_height=image_size.height,
    )

    try:
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
    """Sweep representative image sizes and verify that none produces more
    image tokens than ``get_max_image_tokens()`` reports."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    seen_ratios: set[float] = set()
    candidate_sizes: list[ImageSize] = []
    for width, height in itertools.product(range(32, 4096), repeat=2):
        ratio = width / height
        if ratio < 1 or ratio > 2 or ratio in seen_ratios:
            continue
        seen_ratios.add(ratio)
        candidate_sizes.append(ImageSize(width, height))

    failed_size_excs: list[tuple[ImageSize, Exception]] = []

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(candidate_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        raise AssertionError("Found failing image sizes:" + details)
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
|
||||
processor: BaseMultiModalProcessor,
|
||||
num_imgs: int,
|
||||
|
||||
@ -13,6 +13,68 @@ from vllm.multimodal.utils import cached_get_tokenizer
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Validate one image size against the processor's max token budget.

    Failures are collected into ``failed_size_excs`` as ``(size, exception)``
    pairs instead of being raised, so a parallel sweep can gather them all.
    """
    info = processor.info
    feature_size = info.get_num_image_tokens(
        image_width=image_size.width,
        image_height=image_size.height,
    )

    try:
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 5 minutes to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """Sweep representative image sizes and verify that none produces more
    image tokens than ``get_max_image_tokens()`` reports."""
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )
    info = processor.info

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    seen_ratios: set[float] = set()
    candidate_sizes: list[ImageSize] = []
    for width, height in itertools.product(range(32, 4096), repeat=2):
        ratio = width / height
        if ratio < 1 or ratio > 6 or ratio in seen_ratios:
            continue
        seen_ratios.add(ratio)
        candidate_sizes.append(ImageSize(width, height))

    failed_size_excs: list[tuple[ImageSize, Exception]] = []

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(candidate_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        details = "\n========\n".join(f"[{size}]\n{exc}"
                                      for size, exc in failed_size_excs)
        raise AssertionError("Found failing image sizes:" + details)
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
|
||||
processor: BaseMultiModalProcessor,
|
||||
num_imgs: int,
|
||||
|
||||
@ -19,8 +19,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
|
||||
NestedTensors)
|
||||
from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
|
||||
VideoProcessorItems)
|
||||
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
|
||||
VideoEmbeddingItems, VideoProcessorItems)
|
||||
from vllm.multimodal.processing import PromptReplacement
|
||||
from vllm.multimodal.profiling import ProcessorInputs
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@ -145,6 +145,10 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
|
||||
|
||||
return (unpadded_features, newline_features)
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
    """Return the image size that yields the largest image feature count.

    NOTE: This hardcoded value is found via processor tests.
    """
    worst_case_size = ImageSize(width=1153, height=944)
    return worst_case_size
|
||||
|
||||
def _get_num_frame_tokens(
|
||||
self,
|
||||
*,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user