Fix GLM-4.5V-FP8 numerical issue (#22949)

Signed-off-by: qizixi <qizixi@meta.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2026-05-24 21:24:34 +08:00 · 2025-08-19 16:56:31 +09:00 · 2025-08-19 16:56:31 +09:00 · 4efd43e9b4
commit 4efd43e9b4
parent 3c8a787247
3 changed files with 154 additions and 5 deletions
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -333,6 +333,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
    )
 # GLM-4.5V
 def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the GLM-4.5V example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either "image" or "video"; selects which placeholder
            tokens are embedded in every prompt.

    Returns:
        A ModelRequestData carrying the engine arguments and the prompts.

    Raises:
        ValueError: If modality is neither "image" nor "video".
    """
    placeholder_by_modality = {
        "image": "<|begin_of_image|><|image|><|end_of_image|>",
        "video": "<|begin_of_video|><|video|><|end_of_video|>",
    }
    # Fail fast with a clear message. Without this check an unsupported
    # modality left the placeholder unassigned and only surfaced later as
    # an UnboundLocalError while the prompts were being formatted.
    if modality not in placeholder_by_modality:
        raise ValueError(
            f"Unsupported modality for GLM-4.5V: {modality!r}; "
            "expected 'image' or 'video'"
        )
    placeholder = placeholder_by_modality[modality]

    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        # Exactly one multimodal item of the requested kind per prompt.
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
 # GLM-4.5V-FP8
 def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the GLM-4.5V-FP8 example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either "image" or "video"; selects which placeholder
            tokens are embedded in every prompt.

    Returns:
        A ModelRequestData carrying the engine arguments and the prompts.

    Raises:
        ValueError: If modality is neither "image" nor "video".
    """
    placeholder_by_modality = {
        "image": "<|begin_of_image|><|image|><|end_of_image|>",
        "video": "<|begin_of_video|><|video|><|end_of_video|>",
    }
    # Fail fast with a clear message. Without this check an unsupported
    # modality left the placeholder unassigned and only surfaced later as
    # an UnboundLocalError while the prompts were being formatted.
    if modality not in placeholder_by_modality:
        raise ValueError(
            f"Unsupported modality for GLM-4.5V-FP8: {modality!r}; "
            "expected 'image' or 'video'"
        )
    placeholder = placeholder_by_modality[modality]

    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        # Exactly one multimodal item of the requested kind per prompt.
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
 # H2OVL-Mississippi
 def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -383,8 +457,8 @@ def run_hyperclovax_seed_vision(
    for question in questions:
        if modality == "image":
            """
-            ocr: List the words in the image in raster order. 
+            ocr: List the words in the image in raster order.
-                Even if the word order feels unnatural for reading, 
+                Even if the word order feels unnatural for reading,
                the model will handle it as long as it follows raster order.
                e.g. "Naver, CLOVA, bigshane"
            lens_keywords: List the entity names in the image.
@ -1448,6 +1522,8 @@ model_example_map = {
    "gemma3n": run_gemma3n,
    "glm4v": run_glm4v,
    "glm4_1v": run_glm4_1v,
    "glm4_5v": run_glm4_5v,
    "glm4_5v_fp8": run_glm4_5v_fp8,
    "h2ovl_chat": run_h2ovl,
    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
    "idefics3": run_idefics3,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -1064,6 +1064,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    )
 # GLM-4.5V
 def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for the GLM-4.5V example.

    A single user message is assembled with one image entry per URL
    followed by the question text, rendered to a prompt string through the
    model processor's chat template, and returned together with the
    fetched images.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to attach; the per-prompt image
            limit is set to the number of URLs.

    Returns:
        A ModelRequestData with the engine arguments, the rendered prompt
        and the fetched image data.
    """
    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Image entries come first, then the question as the final text entry.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    chat = [{"role": "user", "content": user_content}]

    # Render the chat to a plain prompt string (no tokenization here).
    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=list(map(fetch_image, image_urls)),
    )
 # GLM-4.5V-FP8
 def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for the GLM-4.5V-FP8 example.

    A single user message is assembled with one image entry per URL
    followed by the question text, rendered to a prompt string through the
    model processor's chat template, and returned together with the
    fetched images.

    Args:
        question: Text question placed after the image entries.
        image_urls: URLs of the images to attach; the per-prompt image
            limit is set to the number of URLs.

    Returns:
        A ModelRequestData with the engine arguments, the rendered prompt
        and the fetched image data.
    """
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Image entries come first, then the question as the final text entry.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    chat = [{"role": "user", "content": user_content}]

    # Render the chat to a plain prompt string (no tokenization here).
    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=list(map(fetch_image, image_urls)),
    )
 model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
@ -1096,6 +1166,8 @@ model_example_map = {
    "step3": load_step3,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_5v": load_glm4_5v,
    "glm4_5v_fp8": load_glm4_5v_fp8,
 }
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@ -126,7 +126,7 @@ class Glm4vVideoPixelInputs(TensorSchema):
        - ctpp: Number of channels * temporal_patch_size *
            patch_size * patch_size
        - f: Number of frames
-        - g: Grid dimensions (3 for grid_t which is usually 1 for processed 
+        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    """
    type: Literal["pixel_values_videos"] = "pixel_values_videos"
@ -141,7 +141,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
        - p: Number of video patches across all frames
        - h: Hidden size (must match language model backbone)
        - f: Number of frames
-        - g: Grid dimensions (3 for grid_t which is usually 1 for processed 
+        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    """
    type: Literal["video_embeds"] = "video_embeds"
@ -234,7 +234,8 @@ class Glm4vVisionAttention(nn.Module):
            total_num_kv_heads=num_heads,
            bias=False,
            quant_config=quant_config,
-            prefix=f"{prefix}.qkv",
+            # When a quant_config is given, name this layer "qkv_proj" so its
+            # prefix aligns with the GLM-4.5V-FP8 quantization config.
            prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
        )
        self.proj = RowParallelLinear(
            input_size=projection_size,