From 37e380613220f607cb465fc15f72a4a033a98b23 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 17 Mar 2025 10:04:21 -0700 Subject: [PATCH] [Bugfix] Make Gemma3 MM V0 only for now (#14971) Signed-off-by: Roger Wang --- docs/source/models/supported_models.md | 5 ++++- vllm/model_executor/models/gemma3_mm.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3d42d5f6b529e..2d7617d9ebab9 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. * ✅︎ * ✅︎ - * ⚠️ + * - * `GLM4VForCausalLM`^ * GLM-4V * T + I @@ -948,8 +948,11 @@ V1 currently uses a simplified attention pattern: - Uses causal attention for all tokens, including image tokens - Generates reasonable outputs but does not match the original model's attention for text + image inputs - Will be updated in the future to support the correct behavior +- Does not support `"do_pan_and_scan": True` This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. + +For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment. ::: :::{note} diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index b945e4732a507..27b254b9c5c84 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import (MultiModalEmbeddings, SupportsLoRA, - SupportsMultiModal, SupportsPP) + SupportsMultiModal, SupportsPP, SupportsV0Only) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module): info=Gemma3ProcessingInfo, dummy_inputs=Gemma3DummyInputsBuilder) class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, - SupportsLoRA): + SupportsLoRA, SupportsV0Only): packed_modules_mapping = { "qkv_proj": [ "q_proj",