diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index bd7a57b436213..c058c20f1ed73 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -591,7 +591,8 @@ See [this page](generative_models.md) for more information on how to use generat
 | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
 | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V-Air`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
diff --git a/tests/models/registry.py b/tests/models/registry.py
index fdc7888c85efb..d88d77cddcca5 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -377,9 +377,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
                                         trust_remote_code=True,
                                         hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
-    "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"),  # noqa: E501
-    "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5",
-                                          min_transformers_version="4.54",
+    "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking"),  # noqa: E501
+    "Glm4MoeForCausalLM": _HfExamplesInfo("zai-org/GLM-4.5",
+                                          min_transformers_version="4.54"),  # noqa: E501
+    "Glm4v_moeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V-Air",
                                           is_available_online=False),  # noqa: E501
     "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
                                       extras={"2b": "h2oai/h2ovl-mississippi-2b"},  # noqa: E501
@@ -515,8 +516,8 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                        is_available_online=False,
                                        speculative_model="openbmb/MiniCPM-2B-sft-bf16",
                                        tokenizer="openbmb/MiniCPM-2B-sft-bf16"),
-    "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4.5",
-                                       speculative_model="THUDM/GLM-4.5",
+    "Glm4MoeMTPModel": _HfExamplesInfo("zai-org/GLM-4.5",
+                                       speculative_model="zai-org/GLM-4.5",
                                        min_transformers_version="4.54",
                                        is_available_online=False),
     "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py
index 478f4b9166725..91913c933184e 100644
--- a/tests/tool_use/test_glm4_moe_tool_parser.py
+++ b/tests/tool_use/test_glm4_moe_tool_parser.py
@@ -12,7 +12,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 pytest.skip("skip glm4_moe parser test", allow_module_level=True)
 
 # Use a common model that is likely to be available
-MODEL = "THUDM/GLM-4.5"
+MODEL = "zai-org/GLM-4.5"
 
 
 @pytest.fixture(scope="module")
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index dddd4d6a71170..24dd86620fe91 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -1096,7 +1096,7 @@ class MRotaryEmbedding(RotaryEmbedding):
                 audio_feature_lengths=audio_feature_lengths,
                 use_audio_in_video=use_audio_in_video,
             )
-        elif "glm4v" in hf_config.model_type:
+        elif hf_config.model_type in ["glm4v", "glm4v_moe"]:
             return cls._glm4v_get_input_positions_tensor(
                 input_tokens=input_tokens,
                 hf_config=hf_config,
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 7c9840790fe3e..7983895687a38 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -37,8 +37,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 from transformers import BatchFeature
-from transformers.models.glm4v.configuration_glm4v import (Glm4vConfig,
-                                                           Glm4vVisionConfig)
+from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
 from transformers.models.glm4v.image_processing_glm4v import (
     Glm4vImageProcessor, smart_resize)
 from transformers.models.glm4v.video_processing_glm4v import (
@@ -801,7 +800,7 @@ class Glm4vVisionTransformer(nn.Module):
 class Glm4vProcessingInfo(BaseProcessingInfo):
 
     def get_hf_config(self):
-        return self.ctx.get_hf_config(Glm4vConfig)
+        return self.ctx.get_hf_config()
 
     def get_tokenizer(self):
         return self.ctx.tokenizer
@@ -1253,7 +1252,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        config: Glm4vConfig = vllm_config.model_config.hf_config
+        config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
 
@@ -1267,12 +1266,18 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal,
             prefix=maybe_prefix(prefix, "visual"),
         )
 
+        if config.model_type == "glm4v":
+            architectures = ["Glm4ForCausalLM"]
+        elif config.model_type == "glm4v_moe":
+            architectures = ["Glm4MoeForCausalLM"]
+        else:
+            architectures = None
+
         self.language_model = init_vllm_registered_model(
             vllm_config=vllm_config,
-            prefix=maybe_prefix(prefix, ""),
-            architectures=["Glm4ForCausalLM"],
-            hf_config=self.config.get_text_config(),
-        )
+            hf_config=config.text_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=architectures)
 
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 0c5d87a7dc472..9b6ab52d86805 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -206,6 +206,7 @@ _MULTIMODAL_MODELS = {
     "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),  # noqa: E501
     "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
     "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
+    "Glm4v_moeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"),  # noqa: E501
     "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"),  # noqa: E501
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
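
A minimal sketch (not part of the diff) of what the new registry entry buys: both GLM-4V architecture strings resolve to the same implementation class in `glm4_1v.py`. It assumes vLLM's public `ModelRegistry.resolve_model_cls` helper, which returns a `(model_class, architecture)` tuple.

```python
# Hypothetical check, not part of the diff: the dense and MoE architecture
# strings should now resolve to the same class in
# vllm/model_executor/models/glm4_1v.py.
from vllm import ModelRegistry

dense_cls, _ = ModelRegistry.resolve_model_cls(
    ["Glm4vForConditionalGeneration"])
moe_cls, _ = ModelRegistry.resolve_model_cls(
    ["Glm4v_moeForConditionalGeneration"])

# Only the text backbone chosen inside __init__ differs between the two.
assert dense_cls is moe_cls
```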
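The shared class works because `Glm4vForConditionalGeneration.__init__` now picks the text backbone from `config.model_type`, as restated below (the helper name is hypothetical; the diff inlines this logic in `__init__`):

```python
# Standalone restatement of the backbone-selection logic added to
# Glm4vForConditionalGeneration.__init__.
def select_text_architectures(model_type: str):
    if model_type == "glm4v":
        return ["Glm4ForCausalLM"]      # dense GLM-4 text backbone
    if model_type == "glm4v_moe":
        return ["Glm4MoeForCausalLM"]   # GLM-4.5 MoE text backbone
    return None  # let init_vllm_registered_model fall back to text_config

assert select_text_architectures("glm4v_moe") == ["Glm4MoeForCausalLM"]
```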
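Finally, a hedged end-to-end usage sketch. The checkpoint name comes from the diff, but the test registry marks it `is_available_online=False`, so public weight availability is an assumption; the call pattern shown is vLLM's standard `LLM.chat` multimodal entry point with OpenAI-style message content.

```python
# Sketch of offline inference with the newly registered GLM-4.5V model.
# Assumes the zai-org/GLM-4.5V-Air weights are downloadable, which the
# test registry does not yet guarantee (is_available_online=False).
from vllm import LLM, SamplingParams

llm = LLM(model="zai-org/GLM-4.5V-Air", trust_remote_code=True)

messages = [{
    "role": "user",
    "content": [
        {"type": "image_url",
         "image_url": {"url": "https://example.com/demo.jpg"}},
        {"type": "text", "text": "Describe this image."},
    ],
}]

outputs = llm.chat(messages, SamplingParams(max_tokens=128))
print(outputs[0].outputs[0].text)
```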