From c07a73317d202c2dad67f12893fcddb6d3664950 Mon Sep 17 00:00:00 2001
From: Jiangyun Zhu <riverclouds.zhu@qq.com>
Date: Thu, 28 Aug 2025 14:51:24 +0800
Subject: [PATCH] [CI] Enable idefics3 and fuyu-8b tests in multimodal tests
 (#23790)

Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
---
 .../multimodal/generation/test_common.py      | 36 ++++++++-----------
 1 file changed, 15 insertions(+), 21 deletions(-)
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 2b60faae8ec0b..d61b182761e44 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -189,23 +189,21 @@ VLM_TEST_SETTINGS = {
         },
         marks=[pytest.mark.core_model],
     ),
-    # FIXME(Isotr0py): Enable this test after
-    # https://github.com/huggingface/transformers/pull/39470 released
-    # "idefics3-transformers": VLMTestInfo(
-    #     models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
-    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-    #     prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
-    #     img_idx_to_prompt=lambda idx: "<image>",
-    #     max_model_len=8192,
-    #     max_num_seqs=2,
-    #     auto_cls=AutoModelForImageTextToText,
-    #     hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
-    #     image_size_factors=[(0.25, 0.5, 1.0)],
-    #     vllm_runner_kwargs={
-    #         "model_impl": "transformers",
-    #     },
-    #     marks=[pytest.mark.core_model],
-    # ),
+    "idefics3-transformers": VLMTestInfo(
+        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<image>",
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+        },
+        marks=[pytest.mark.core_model],
+    ),
     # Pixel values from processor are not 4D or 5D arrays
     "qwen2_5_vl-transformers": VLMTestInfo(
         models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -322,10 +320,6 @@ VLM_TEST_SETTINGS = {
         vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        # FIXME(Isotr0py): This model is broken in Transformers v4.54.1, we
-        # should enable this again after the fix is released:
-        # https://github.com/huggingface/transformers/pull/39915
-        marks=[pytest.mark.skip("HF model is broken")],
     ),
     "gemma3": VLMTestInfo(
         models=["google/gemma-3-4b-it"],