From 37e380613220f607cb465fc15f72a4a033a98b23 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 17 Mar 2025 10:04:21 -0700
Subject: [PATCH] [Bugfix] Make Gemma3 MM V0 only for now (#14971)

Signed-off-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.md  | 5 ++++-
 vllm/model_executor/models/gemma3_mm.py | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 3d42d5f6b529e..2d7617d9ebab9 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -763,7 +763,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
   * ✅︎
   * ✅︎
-  * ⚠️
+  *
 - * `GLM4VForCausalLM`<sup>^</sup>
   * GLM-4V
   * T + I
@@ -948,8 +948,11 @@ V1 currently uses a simplified attention pattern:
 - Uses causal attention for all tokens, including image tokens
 - Generates reasonable outputs but does not match the original model's attention for text + image inputs
 - Will be updated in the future to support the correct behavior
+- Does not support pan-and-scan image processing (`"do_pan_and_scan": True`)
 
 This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
+
+Because of these V1 limitations, `Gemma3ForConditionalGeneration` is currently supported only on V0.
 :::
 
 :::{note}
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index b945e4732a507..27b254b9c5c84 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -25,7 +25,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
-                         SupportsMultiModal, SupportsPP)
+                         SupportsMultiModal, SupportsPP, SupportsV0Only)
 from .siglip import SiglipVisionModel
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
@@ -374,7 +374,7 @@ class Gemma3MultiModalProjector(nn.Module):
                                         info=Gemma3ProcessingInfo,
                                         dummy_inputs=Gemma3DummyInputsBuilder)
 class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
-                                     SupportsLoRA):
+                                     SupportsLoRA, SupportsV0Only):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",