diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 988ad35cdd7e6..a13b6a9225ae5 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -333,6 +333,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# GLM-4.5V
+def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "zai-org/GLM-4.5V"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        mm_processor_kwargs={
+            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+        tensor_parallel_size=4,
+    )
+
+    if modality == "image":
+        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    elif modality == "video":
+        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    prompts = [
+        (
+            "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
+            f"{placeholder}"
+            f"{question}<|assistant|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
+# GLM-4.5V-FP8
+def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "zai-org/GLM-4.5V-FP8"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        mm_processor_kwargs={
+            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+        tensor_parallel_size=4,
+    )
+
+    if modality == "image":
+        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    elif modality == "video":
+        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    prompts = [
+        (
+            "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
+            f"{placeholder}"
+            f"{question}<|assistant|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # H2OVL-Mississippi
 def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -383,8 +457,8 @@ def run_hyperclovax_seed_vision(
     for question in questions:
         if modality == "image":
             """
-            ocr: List the words in the image in raster order. 
-            Even if the word order feels unnatural for reading, 
+            ocr: List the words in the image in raster order.
+            Even if the word order feels unnatural for reading,
                 the model will handle it as long as it follows raster order.
                 e.g. "Naver, CLOVA, bigshane"
             lens_keywords: List the entity names in the image.
@@ -1448,6 +1522,8 @@ model_example_map = {
     "gemma3n": run_gemma3n,
     "glm4v": run_glm4v,
     "glm4_1v": run_glm4_1v,
+    "glm4_5v": run_glm4_5v,
+    "glm4_5v_fp8": run_glm4_5v_fp8,
     "h2ovl_chat": run_h2ovl,
     "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
     "idefics3": run_idefics3,
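Note on usage (not part of the diff): the prompt template and engine arguments introduced in `run_glm4_5v` can also be exercised directly through vLLM's offline `LLM` API. The sketch below is illustrative only; the image path and sampling settings are placeholders, while the model name, placeholder tokens, and engine arguments mirror the function above.

```python
# Illustrative sketch, not part of this PR: single-image inference with the
# GLM-4.5V prompt format used by run_glm4_5v. The image path and sampling
# settings are placeholders.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="zai-org/GLM-4.5V",
    max_model_len=4096,
    max_num_seqs=2,
    limit_mm_per_prompt={"image": 1},
    enforce_eager=True,
    tensor_parallel_size=4,
)

prompt = (
    "[gMASK]<|system|>\nYou are a helpful assistant.<|user|>\n"
    "<|begin_of_image|><|image|><|end_of_image|>"
    "What is in this image?<|assistant|>assistant\n"
)

outputs = llm.generate(
    {
        "prompt": prompt,
        # The single image fills the <|image|> placeholder in the prompt.
        "multi_modal_data": {"image": Image.open("example.jpg")},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```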
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 799337ed68503..56519c95f822f 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1064,6 +1064,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+# GLM-4.5V
+def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "zai-org/GLM-4.5V"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=32768,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        enforce_eager=True,
+        tensor_parallel_size=4,
+    )
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+    processor = AutoProcessor.from_pretrained(model_name)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
+# GLM-4.5V-FP8
+def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "zai-org/GLM-4.5V-FP8"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=32768,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        enforce_eager=True,
+        tensor_parallel_size=4,
+    )
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+    processor = AutoProcessor.from_pretrained(model_name)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 model_example_map = {
     "aria": load_aria,
     "aya_vision": load_aya_vision,
@@ -1096,6 +1166,8 @@ model_example_map = {
     "step3": load_step3,
     "tarsier": load_tarsier,
     "tarsier2": load_tarsier2,
+    "glm4_5v": load_glm4_5v,
+    "glm4_5v_fp8": load_glm4_5v_fp8,
 }
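For the multi-image path (not part of the diff): `load_glm4_5v` only builds a `ModelRequestData`; generation happens in the example's shared `main`. A minimal end-to-end sketch of the same flow, assuming placeholder image URLs and illustrative sampling settings:

```python
# Illustrative sketch, not part of this PR: multi-image inference mirroring
# load_glm4_5v. The image URLs and sampling settings are placeholders.
from transformers import AutoProcessor

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

model_name = "zai-org/GLM-4.5V"
image_urls = [
    "https://example.com/a.jpg",  # placeholder
    "https://example.com/b.jpg",  # placeholder
]

# Render the prompt with the model's chat template, as load_glm4_5v does.
messages = [
    {
        "role": "user",
        "content": [
            *[{"type": "image", "image": url} for url in image_urls],
            {"type": "text", "text": "What are the differences between these images?"},
        ],
    }
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

llm = LLM(
    model=model_name,
    max_model_len=32768,
    max_num_seqs=2,
    # Must cover every image referenced by the prompt.
    limit_mm_per_prompt={"image": len(image_urls)},
    enforce_eager=True,
    tensor_parallel_size=4,
)
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": [fetch_image(url) for url in image_urls]},
    },
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)
```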
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 015577322ffe3..08252c51310be 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -126,7 +126,7 @@ class Glm4vVideoPixelInputs(TensorSchema):
         - ctpp: Number of channels * temporal_patch_size * patch_size *
           patch_size
         - f: Number of frames
-        - g: Grid dimensions (3 for grid_t which is usually 1 for processed 
+        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
@@ -141,7 +141,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
         - p: Number of video patches across all frames
         - h: Hidden size (must match language model backbone)
         - f: Number of frames
-        - g: Grid dimensions (3 for grid_t which is usually 1 for processed 
+        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
           video, grid_h, grid_w)
     """
     type: Literal["video_embeds"] = "video_embeds"
@@ -234,7 +234,8 @@ class Glm4vVisionAttention(nn.Module):
             total_num_kv_heads=num_heads,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.qkv",
+            # Change qkv prefix to align with GLM-4.5V-FP8 quantization config
+            prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
         )
         self.proj = RowParallelLinear(
             input_size=projection_size,
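For context on the last hunk (not part of the diff): vLLM quantization configs decide how to treat each linear layer by matching the module's dotted prefix against layer names recorded with the checkpoint, so the vision attention QKV projection has to register under the name the FP8 checkpoint expects. The sketch below is a simplified, hypothetical illustration of that matching; the layer names are assumptions, not copied from the real GLM-4.5V-FP8 config.

```python
# Hypothetical illustration, not vLLM code: a layer is only treated as
# quantized if its registered prefix matches a name known to the checkpoint's
# quantization config. Layer names below are assumed examples.
def uses_fp8(prefix: str, quantized_layers: set[str]) -> bool:
    """Return True if the quantization config recognizes this module prefix."""
    return prefix in quantized_layers

# Name an FP8 checkpoint might record for the vision attention QKV projection.
quantized_layers = {"model.visual.blocks.0.attn.qkv_proj"}

print(uses_fp8("model.visual.blocks.0.attn.qkv", quantized_layers))       # False: old prefix misses
print(uses_fp8("model.visual.blocks.0.attn.qkv_proj", quantized_layers))  # True: new prefix matches
```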