diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index a3908e30ec6e3..0996bcf60aa1c 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -65,7 +65,7 @@ from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
                                     MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                          BaseProcessingInfo, PromptReplacement,
-                                         PromptUpdate)
+                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
@@ -1213,7 +1213,10 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
             placeholder.append(eoi_token_id)
             placeholder.extend(frame_idx)
             placeholder.append(eov_token_id)
-            return placeholder
+            return PromptUpdateDetails.select_token_id(
+                placeholder,
+                embed_token_id=hf_processor.video_token_id,
+            )
 
         return [
             PromptReplacement(