From fe2016de2d2d68bb967e9c43ee168db01322b0f1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 23 Oct 2025 22:42:40 +0800 Subject: [PATCH] [CI/Build] Remove unnecessary flags from test registry (#27353) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 4 +- tests/distributed/test_pipeline_parallel.py | 12 ++- tests/distributed/test_sequence_parallel.py | 12 ++- .../entrypoints/openai/test_chat_template.py | 4 +- tests/entrypoints/test_chat_utils.py | 16 ++- .../multimodal/generation/vlm_utils/core.py | 5 +- .../multimodal/processing/test_common.py | 4 +- .../processing/test_tensor_schema.py | 4 +- tests/models/multimodal/test_mapping.py | 4 +- tests/models/registry.py | 101 ++++++------------ tests/models/test_initialization.py | 10 +- tests/models/utils.py | 4 +- vllm/model_executor/models/glm4_1v.py | 32 +----- 13 files changed, 89 insertions(+), 123 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4f53a2be4382..2bffb28d3193 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -374,8 +374,8 @@ th { | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | -| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | -| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | +| `HunYuanDenseV1ForCausalLM` | Hunyuan Dense | `tencent/Hunyuan-7B-Instruct` | ✅︎ | ✅︎ | +| `HunYuanMoEV1ForCausalLM` | Hunyuan-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 8ee0d12df640..0ab94d30858f 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -244,7 +244,7 @@ def _compare_tp( tokenizer_mode = model_info.tokenizer_mode hf_overrides = model_info.hf_overrides hf_config = get_config(model_id, trust_remote_code) - skip_tokenizer_init = model_info.skip_tokenizer_init + require_embed_inputs = model_info.require_embed_inputs max_num_seqs = model_info.max_num_seqs dtype = "float16" @@ -299,8 +299,14 @@ def _compare_tp( common_args.extend(["--load-format", load_format]) if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) - if skip_tokenizer_init: - common_args.append("--skip-tokenizer-init") + if require_embed_inputs: + common_args.extend( + [ + "--skip-tokenizer-init", + "--enable-prompt-embeds", + "--enable-mm-embeds", + ] + ) if max_num_seqs: common_args.extend(["--max-num-seqs", f"{max_num_seqs}"]) diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 3646f48426b6..94b2b51211a6 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -181,7 +181,7 @@ def _compare_sp( trust_remote_code = model_info.trust_remote_code tokenizer_mode = model_info.tokenizer_mode hf_overrides = model_info.hf_overrides - skip_tokenizer_init = model_info.skip_tokenizer_init + require_embed_inputs = model_info.require_embed_inputs if load_format == "dummy": # Avoid OOM @@ -233,8 +233,14 @@ def _compare_sp( common_args.extend(["--load-format", load_format]) if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) - if skip_tokenizer_init: - common_args.append("--skip-tokenizer-init") + if require_embed_inputs: + common_args.extend( + [ + "--skip-tokenizer-init", + "--enable-prompt-embeds", + "--enable-mm-embeds", + ] + ) compilation_config = { "mode": CompilationMode.VLLM_COMPILE, diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index d1202a59752b..ee79ed59c410 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -114,7 +114,9 @@ def test_get_gen_prompt( trust_remote_code=model_info.trust_remote_code, revision=model_info.revision, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index f5faa7a3507c..378c2624f7d9 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1742,7 +1742,9 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) @@ -1842,7 +1844,9 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, 
model, expected_kwa revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) @@ -1903,7 +1907,9 @@ def test_resolve_content_format_hf_defined(model, expected_format): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) @@ -1961,7 +1967,9 @@ def test_resolve_content_format_fallbacks(model, expected_format): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 8d0e9b3eee9f..03ff3bcf6307 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -71,8 +71,9 @@ def run_test( vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode if model_info.hf_overrides: vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides - if model_info.skip_tokenizer_init: - vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init + if model_info.require_embed_inputs: + for k in ("skip_tokenizer_init", "enable_prompt_embeds", "enable_mm_embeds"): + vllm_runner_kwargs_[k] = model_info.require_embed_inputs if vllm_runner_kwargs: vllm_runner_kwargs_.update(vllm_runner_kwargs) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d0f730630d7f..3ca7c0a2c899 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -108,7 +108,9 @@ def _test_processing_correctness( hf_overrides=model_info.hf_overrides, # Ensure that the cache can fit all of the data mm_processor_cache_gb=2048, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 024fe76e6ff7..6f4ddb426cd5 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -218,7 +218,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=hf_overrides_fn, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + 
enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 2179cf33a573..2f38dc450ef9 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -59,7 +59,9 @@ def test_hf_model_weights_mapper(model_arch: str): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, dtype=model_info.dtype, ) diff --git a/tests/models/registry.py b/tests/models/registry.py index bd5a4650081f..f5072ef7cbad 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -6,7 +6,6 @@ from dataclasses import dataclass, field from typing import Any, Literal import pytest -import torch from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION @@ -33,6 +32,11 @@ class _HfExamplesInfo: for speculative decoding. """ + speculative_method: str | None = None + """ + The method to use for speculative decoding. + """ + min_transformers_version: str | None = None """ The minimum version of HF Transformers that is required to run this model. @@ -48,9 +52,10 @@ class _HfExamplesInfo: The reason for the minimum/maximum version requirement. """ - skip_tokenizer_init: bool = False + require_embed_inputs: bool = False """ - If true, skip initialization of tokenizer and detokenizer. + If `True`, enables prompt and multi-modal embedding inputs while + disabling tokenization. 
""" dtype: ModelDType = "auto" @@ -168,10 +173,7 @@ class _HfExamplesInfo: _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] - "ApertusForCausalLM": _HfExamplesInfo( - "swiss-ai/Apertus-8B-Instruct-2509", - min_transformers_version="4.56.0", - ), + "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), "ArceeForCausalLM": _HfExamplesInfo("arcee-ai/AFM-4.5B-Base"), @@ -192,7 +194,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ), "BambaForCausalLM": _HfExamplesInfo( "ibm-ai-platform/Bamba-9B-v1", - min_transformers_version="4.55.3", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}, ), "BloomForCausalLM": _HfExamplesInfo( @@ -212,11 +213,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "CohereForAI/c4ai-command-r7b-12-2024", trust_remote_code=True, ), - "CwmForCausalLM": _HfExamplesInfo( - "facebook/cwm", - trust_remote_code=True, - is_available_online=False, - ), + "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"), "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), "DeciLMForCausalLM": _HfExamplesInfo( "nvidia/Llama-3_3-Nemotron-Super-49B-v1", @@ -232,18 +229,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True, ), "DeepseekV32ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3.2-Exp"), - "Ernie4_5ForCausalLM": _HfExamplesInfo( - "baidu/ERNIE-4.5-0.3B-PT", min_transformers_version="4.54" - ), - "Ernie4_5_MoeForCausalLM": _HfExamplesInfo( - "baidu/ERNIE-4.5-21B-A3B-PT", min_transformers_version="4.54" - ), + "Ernie4_5ForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-0.3B-PT"), + "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT"), "ExaoneForCausalLM": _HfExamplesInfo( "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", trust_remote_code=True ), - "Exaone4ForCausalLM": _HfExamplesInfo( - "LGAI-EXAONE/EXAONE-4.0-32B", min_transformers_version="4.54" - ), + "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"), "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconH1ForCausalLM": _HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base"), @@ -251,14 +242,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), - "Gemma3nForCausalLM": _HfExamplesInfo( - "google/gemma-3n-E2B-it", min_transformers_version="4.53" - ), + "Gemma3nForCausalLM": _HfExamplesInfo("google/gemma-3n-E2B-it"), "GlmForCausalLM": _HfExamplesInfo("zai-org/glm-4-9b-chat-hf"), "Glm4ForCausalLM": _HfExamplesInfo("zai-org/GLM-4-9B-0414"), - "Glm4MoeForCausalLM": _HfExamplesInfo( - "zai-org/GLM-4.5", min_transformers_version="4.54" - ), + "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5"), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo( "bigcode/starcoder", @@ -266,8 +253,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "tiny": "bigcode/tiny_starcoder_py", "santacoder": "bigcode/gpt_bigcode-santacoder", }, - min_transformers_version="4.55.1", - transformers_version_reason="HF model broken in 4.55.0", ), "GPTJForCausalLM": _HfExamplesInfo( "Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"} @@ -279,8 +264,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { 
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), "GraniteMoeHybridForCausalLM": _HfExamplesInfo( - "ibm-granite/granite-4.0-tiny-preview", - min_transformers_version="4.55.3", + "ibm-granite/granite-4.0-tiny-preview" ), "GraniteMoeSharedForCausalLM": _HfExamplesInfo( "ibm-research/moe-7b-1b-active-shared-experts" @@ -288,15 +272,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Grok1ModelForCausalLM": _HfExamplesInfo( "hpcai-tech/grok-1", trust_remote_code=True ), + "HunYuanDenseV1ForCausalLM": _HfExamplesInfo("tencent/Hunyuan-7B-Instruct"), "HunYuanMoEV1ForCausalLM": _HfExamplesInfo( "tencent/Hunyuan-A13B-Instruct", trust_remote_code=True ), - # TODO: Remove is_available_online once their config.json is fixed - "HunYuanDenseV1ForCausalLM": _HfExamplesInfo( - "tencent/Hunyuan-7B-Instruct-0124", - trust_remote_code=True, - is_available_online=False, - ), "InternLMForCausalLM": _HfExamplesInfo( "internlm/internlm-chat-7b", trust_remote_code=True ), @@ -312,15 +291,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo( "ai21labs/AI21-Jamba-1.5-Mini", - min_transformers_version="4.55.3", extras={ "tiny": "ai21labs/Jamba-tiny-dev", "random": "ai21labs/Jamba-tiny-random", }, ), - "Lfm2ForCausalLM": _HfExamplesInfo( - "LiquidAI/LFM2-1.2B", min_transformers_version="4.54" - ), + "Lfm2ForCausalLM": _HfExamplesInfo("LiquidAI/LFM2-1.2B"), "Lfm2MoeForCausalLM": _HfExamplesInfo( "LiquidAI/LFM2-8B-A1B", min_transformers_version="4.58" ), @@ -338,7 +314,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ), "Llama4ForCausalLM": _HfExamplesInfo( "meta-llama/Llama-4-Scout-17B-16E-Instruct", - is_available_online=False, ), "LongcatFlashForCausalLM": _HfExamplesInfo( "meituan-longcat/LongCat-Flash-Chat", trust_remote_code=True @@ -346,7 +321,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), "Mamba2ForCausalLM": _HfExamplesInfo( "mistralai/Mamba-Codestral-7B-v0.1", - min_transformers_version="4.55.3", extras={ "random": "yujiepan/mamba2-codestral-v0.1-tiny-random", }, @@ -421,7 +395,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "SeedOssForCausalLM": _HfExamplesInfo( "ByteDance-Seed/Seed-OSS-36B-Instruct", trust_remote_code=True, - is_available_online=False, ), "SmolLM3ForCausalLM": _HfExamplesInfo("HuggingFaceTB/SmolLM3-3B"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b"), @@ -488,7 +461,8 @@ _EMBEDDING_EXAMPLE_MODELS = { "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"), "BertSpladeSparseEmbeddingModel": _HfExamplesInfo( - "naver/splade-v3", is_available_online=False + "naver/splade-v3", + hf_overrides={"architectures": ["BertSpladeSparseEmbeddingModel"]}, ), # [Multimodal] "CLIPModel": _HfExamplesInfo("openai/clip-vit-base-patch32"), @@ -499,18 +473,17 @@ _EMBEDDING_EXAMPLE_MODELS = { "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), "PrithviGeoSpatialMAE": _HfExamplesInfo( "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", - dtype=torch.float16, + dtype="float16", enforce_eager=True, - skip_tokenizer_init=True, - # This is to avoid the model - # going OOM in CI + require_embed_inputs=True, + # This is to avoid the model going OOM in CI max_num_seqs=32, ), "Terratorch": _HfExamplesInfo( "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", 
- dtype=torch.float16, + dtype="float16", enforce_eager=True, - skip_tokenizer_init=True, + require_embed_inputs=True, # This is to avoid the model going OOM in CI max_num_seqs=32, ), @@ -598,10 +571,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), - "Gemma3nForConditionalGeneration": _HfExamplesInfo( - "google/gemma-3n-E2B-it", - min_transformers_version="4.53", - ), + "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"), "GraniteSpeechForConditionalGeneration": _HfExamplesInfo( "ibm-granite/granite-speech-3.3-2b" ), @@ -611,9 +581,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { hf_overrides={"architectures": ["GLM4VForCausalLM"]}, ), "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"), - "Glm4vMoeForConditionalGeneration": _HfExamplesInfo( - "zai-org/GLM-4.5V", min_transformers_version="4.56" - ), + "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V"), "H2OVLChatModel": _HfExamplesInfo( "h2oai/h2ovl-mississippi-800m", trust_remote_code=True, @@ -627,9 +595,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "Idefics3ForConditionalGeneration": _HfExamplesInfo( "HuggingFaceM4/Idefics3-8B-Llama3", - {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, - min_transformers_version="4.56", - transformers_version_reason="HF model broken in 4.55", + extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, ), "InternS1ForConditionalGeneration": _HfExamplesInfo( "internlm/Intern-S1", trust_remote_code=True @@ -781,13 +747,11 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen/Qwen3-VL-4B-Instruct", max_model_len=4096, min_transformers_version="4.57", - is_available_online=False, ), "Qwen3VLMoeForConditionalGeneration": _HfExamplesInfo( "Qwen/Qwen3-VL-30B-A3B-Instruct", max_model_len=4096, min_transformers_version="4.57", - is_available_online=False, ), "Qwen3OmniMoeForConditionalGeneration": _HfExamplesInfo( "Qwen/Qwen3-Omni-30B-A3B-Instruct", @@ -799,9 +763,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Skywork/Skywork-R1V-38B", trust_remote_code=True ), "SmolVLMForConditionalGeneration": _HfExamplesInfo( - "HuggingFaceTB/SmolVLM2-2.2B-Instruct", - min_transformers_version="4.56", - transformers_version_reason="HF model broken in 4.55", + "HuggingFaceTB/SmolVLM2-2.2B-Instruct" ), "Step3VLForConditionalGeneration": _HfExamplesInfo( "stepfun-ai/step3", trust_remote_code=True @@ -817,7 +779,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "VoxtralForConditionalGeneration": _HfExamplesInfo( "mistralai/Voxtral-Mini-3B-2507", - min_transformers_version="4.54", # disable this temporarily until we support HF format is_available_online=False, ), @@ -878,8 +839,8 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "EagleMiniCPMForCausalLM": _HfExamplesInfo( "openbmb/MiniCPM-1B-sft-bf16", trust_remote_code=True, - is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", + speculative_method="eagle", tokenizer="openbmb/MiniCPM-2B-sft-bf16", ), "ErnieMTPModel": _HfExamplesInfo( @@ -890,8 +851,6 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "Glm4MoeMTPModel": _HfExamplesInfo( "zai-org/GLM-4.5", speculative_model="zai-org/GLM-4.5", - min_transformers_version="4.56", - is_available_online=False, ), "LongCatFlashMTPModel": _HfExamplesInfo( "meituan-longcat/LongCat-Flash-Chat", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 9edab1ad67fc..48a6f34366cf 100644 --- a/tests/models/test_initialization.py +++ 
b/tests/models/test_initialization.py @@ -105,20 +105,19 @@ def can_initialize( if model_arch == "WhisperForConditionalGeneration": m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - extra_args = {} - if model_arch in ("PrithviGeoSpatialMAE", "Terratorch"): - extra_args["enable_mm_embeds"] = True - LLM( model_info.default, tokenizer=model_info.tokenizer, tokenizer_mode=model_info.tokenizer_mode, revision=model_info.revision, enforce_eager=model_info.enforce_eager, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, dtype=model_info.dtype, speculative_config={ "model": model_info.speculative_model, + "method": model_info.speculative_method, "num_speculative_tokens": 1, } if model_info.speculative_model @@ -133,7 +132,6 @@ def can_initialize( else "vllm", hf_overrides=hf_overrides_fn, max_num_seqs=model_info.max_num_seqs, - **extra_args, ) diff --git a/tests/models/utils.py b/tests/models/utils.py index ffdb6950678c..9843887a1320 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -309,7 +309,9 @@ def build_model_context( limit_mm_per_prompt=limit_mm_per_prompt, mm_processor_cache_gb=mm_processor_cache_gb, hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.skip_tokenizer_init, + skip_tokenizer_init=model_info.require_embed_inputs, + enable_prompt_embeds=model_info.require_embed_inputs, + enable_mm_embeds=model_info.require_embed_inputs, enforce_eager=model_info.enforce_eager, **model_config_kwargs, ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index bed7c81335e0..85f489837701 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -36,9 +36,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from packaging.version import Version from transformers import BatchFeature -from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, @@ -1270,14 +1268,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): video_mm_data = dict() video_mm_data["videos"] = [[video_array]] - # backward compatibility for Transformers 4.55 unuse_metadata = ["do_sample_frames"] - if ( - not hasattr(VideoMetadata, "frames_indices") - and "frames_indices" in metadata - ): - unuse_metadata.append("frames_indices") - video_mm_data["video_metadata"] = [ [ VideoMetadata( @@ -1296,24 +1287,11 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): mm_kwargs=video_mm_kwargs, tok_kwargs=tok_kwargs, ) - if not video_mm_kwargs["do_sample_frames"] and Version( - TRANSFORMERS_VERSION - ) < Version("4.56.0"): - # Transformers v4.55 has incorrect timestamps issue for - # skip sampling. We construct the placeholder manually to - # get placeholders with correct timestamps. 
- placeholder = self.info._construct_video_placeholder( - video_array, - metadata, - video_outputs["video_grid_thw"].squeeze(0), - ) - video_placeholder = processor.tokenizer.decode(placeholder) - else: - input_ids = video_outputs.pop("input_ids") - input_ids[input_ids == processor.image_token_id] = ( - processor.video_token_id - ) - video_placeholder = processor.tokenizer.batch_decode(input_ids)[0] + input_ids = video_outputs.pop("input_ids") + input_ids[input_ids == processor.image_token_id] = ( + processor.video_token_id + ) + video_placeholder = processor.tokenizer.batch_decode(input_ids)[0] prompt = prompt.replace( "<|begin_of_video|><|video|><|end_of_video|>", video_placeholder,
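
Note for reviewers (not part of the patch): the sketch below shows the pattern this change introduces, where the single `require_embed_inputs` flag on a registry entry is expanded by the touched test helpers into the three engine options, in both keyword-argument and CLI form. The `_ExampleInfo` stand-in and helper names are illustrative only and do not exist in the tree.

from dataclasses import dataclass


@dataclass
class _ExampleInfo:
    # Stand-in for the relevant field of _HfExamplesInfo in tests/models/registry.py.
    require_embed_inputs: bool = False


def embed_input_kwargs(info: _ExampleInfo) -> dict[str, bool]:
    # Keyword-argument form, mirroring e.g. tests/models/utils.py and
    # tests/entrypoints/test_chat_utils.py: one registry flag drives all
    # three engine options together.
    return {
        "skip_tokenizer_init": info.require_embed_inputs,
        "enable_prompt_embeds": info.require_embed_inputs,
        "enable_mm_embeds": info.require_embed_inputs,
    }


def embed_input_cli_args(info: _ExampleInfo) -> list[str]:
    # CLI form, mirroring tests/distributed/test_pipeline_parallel.py and
    # tests/distributed/test_sequence_parallel.py.
    if not info.require_embed_inputs:
        return []
    return [
        "--skip-tokenizer-init",
        "--enable-prompt-embeds",
        "--enable-mm-embeds",
    ]


# Example: a registry entry such as PrithviGeoSpatialMAE sets
# require_embed_inputs=True, so the tests pass all three options at once
# instead of carrying a separate enable_mm_embeds special case.
print(embed_input_kwargs(_ExampleInfo(require_embed_inputs=True)))
print(embed_input_cli_args(_ExampleInfo(require_embed_inputs=True)))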