[CI/Build] Sync multimodal tests (#23181)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author:    Cyrus Leung
Date:      2025-08-20 13:06:42 +08:00
Committer: GitHub (GPG Key ID: B5690EEEBB952194)
Parent:    f729023272
Commit:    de7b67a023
2 changed files with 18 additions and 16 deletions


@@ -275,16 +275,17 @@ def _test_processing_correctness_one(
     "google/gemma-3n-E2B-it",
     "zai-org/glm-4v-9b",
     "zai-org/GLM-4.1V-9B-Thinking",
+    "zai-org/GLM-4.5V",
     "ibm-granite/granite-speech-3.3-2b",
     "h2oai/h2ovl-mississippi-800m",
+    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
     "internlm/Intern-S1",
     "OpenGVLab/InternVL2-1B",
     "OpenGVLab/InternVL3-1B",
-    "HuggingFaceM4/Idefics3-8B-Llama3",
-    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    "Kwai-Keye/Keye-VL-8B-Preview",
     "moonshotai/Kimi-VL-A3B-Instruct",
     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
     "llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -315,10 +316,13 @@ def _test_processing_correctness_one(
     "Qwen/Qwen2-Audio-7B-Instruct",
     "Qwen/Qwen2.5-Omni-3B",
     "Skywork/Skywork-R1V-38B",
+    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    "stepfun-ai/step3",
     "fixie-ai/ultravox-v0_5-llama-3_2-1b",
     "openai/whisper-large-v3",
     "omni-research/Tarsier-7b",
     "omni-research/Tarsier2-Recap-7b",
+    "mistralai/Voxtral-Mini-3B-2507",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])


@@ -215,9 +215,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "HunYuanDenseV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
                                                  trust_remote_code=True,
                                                  is_available_online=False),
-    "HCXVisionForCausalLM": _HfExamplesInfo(
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
-        trust_remote_code=True),
     "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                            trust_remote_code=True),
     "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@@ -298,8 +295,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
     "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
     "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3",
-                                            trust_remote_code=True,
-                                            is_available_online=False),
+                                            trust_remote_code=True),
     "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct",
                                         trust_remote_code=True),
     "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@@ -405,22 +401,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
     "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), # noqa: E501
     "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V",
-                                                        is_available_online=False), # noqa: E501
+                                                        min_transformers_version="4.56"), # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
                                       trust_remote_code=True,
                                       extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
                                       max_transformers_version="4.48", # noqa: E501
                                       transformers_version_reason="HF model is not compatible."), # noqa: E501
+    "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", # noqa: E501
+                                            trust_remote_code=True),
     "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
                                                         {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, # noqa: E501
                                                         min_transformers_version="4.55.1",
                                                         transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501
+    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
+                                                        trust_remote_code=True), # noqa: E501
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                          extras={"2B": "OpenGVLab/InternVL2-2B",
                                                  "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
                                          trust_remote_code=True),
-    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
-                                                        trust_remote_code=True),
     "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
                                                     trust_remote_code=True),
     "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
@@ -464,9 +462,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                transformers_version_reason="HF model is not compatible", # noqa: E501
                                extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
                                        "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501
-    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True,
-                               max_transformers_version="4.53",
-                               transformers_version_reason="HF model is not compatible"), # noqa: E501
+    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B",
+                               trust_remote_code=True,
+                               max_transformers_version="4.53",
+                               transformers_version_reason="HF model is not compatible"), # noqa: E501
     "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501
                                                          extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
     "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
@@ -496,8 +495,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                             min_transformers_version="4.55.1",
                                             transformers_version_reason="HF model broken in 4.55.0"), # noqa: E501
     "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3",
-                                                       trust_remote_code=True,
-                                                       is_available_online=False),
+                                                       trust_remote_code=True),
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
                                      trust_remote_code=True),
     "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501