[Bugfix] Make Gemma3 MM V0 only for now (#14971)

Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
Roger Wang 2025-03-17 10:04:21 -07:00 committed by GitHub
parent c0efdd655b
commit 37e3806132
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 3 deletions

View File

@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
* ✅︎ * ✅︎
* ✅︎ * ✅︎
* ⚠️ *
- * `GLM4VForCausalLM`<sup>^</sup> - * `GLM4VForCausalLM`<sup>^</sup>
* GLM-4V * GLM-4V
* T + I * T + I
@ -948,8 +948,11 @@ V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens - Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs - Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Will be updated in the future to support the correct behavior - Will be updated in the future to support the correct behavior
- Does not support `"do_pan_and_scan": True`
This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends. This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
For these reasons, `Gemma3ForConditionalGeneration` is supported only on V0 at the moment.
::: :::
:::{note} :::{note}

View File

@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP, SupportsV0Only)
from .siglip import SiglipVisionModel from .siglip import SiglipVisionModel
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
maybe_prefix, merge_multimodal_embeddings) maybe_prefix, merge_multimodal_embeddings)
@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
info=Gemma3ProcessingInfo, info=Gemma3ProcessingInfo,
dummy_inputs=Gemma3DummyInputsBuilder) dummy_inputs=Gemma3DummyInputsBuilder)
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
SupportsLoRA): SupportsLoRA, SupportsV0Only):
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",