[ROCm][CI][Bugfix] Fixing the Multi-Modal Models Test (Extended) 1 group (#30013)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-05-23 20:57:55 +08:00 · 2025-12-04 05:00:16 -06:00 · 2025-12-04 05:00:16 -06:00 · e96a6a6dca
commit e96a6a6dca
parent 6366c098d7
10 changed files with 139 additions and 9 deletions
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@ -987,7 +987,8 @@ steps:
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
+- label: Multi-Modal Models Test (Extended) 1 # 60min
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
@ -1011,7 +1012,8 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3
+- label: Multi-Modal Models Test (Extended) 3 # 75min
  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
--- a/tests/models/multimodal/generation/conftest.py
+++ b/tests/models/multimodal/generation/conftest.py
@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM tests."""
 import warnings
 import torch
 from vllm.platforms import current_platform
@ -14,6 +16,20 @@ def pytest_configure(config):
    if not current_platform.is_rocm():
        return
    skip_patterns = ["test_granite_speech.py"]
    if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
        # Skip disabling SDP for Granite Speech tests on ROCm
        return
    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    warnings.warn(
        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
        "to avoid HuggingFace Transformers accuracy issues",
        UserWarning,
        stacklevel=1,
    )
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@ -403,12 +403,13 @@ VLM_TEST_SETTINGS = {
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
        auto_cls=AutoModelForCausalLM,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v": VLMTestInfo(
        models=["zai-org/GLM-4.1V-9B-Thinking"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
+        prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
        max_model_len=2048,
@ -423,6 +424,7 @@ VLM_TEST_SETTINGS = {
        models=["zai-org/GLM-4.1V-9B-Thinking"],
        # GLM4.1V require include video metadata for input
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
@ -737,7 +739,13 @@ VLM_TEST_SETTINGS = {
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
-        marks=[large_gpu_mark(min_gb=48)],
+        marks=[
            large_gpu_mark(min_gb=48),
            pytest.mark.skipif(
                current_platform.is_rocm(),
                reason="Model produces a vector of <UNK> output in HF on ROCm",
            ),
        ],
    ),
    "qwen_vl": VLMTestInfo(
        models=["Qwen/Qwen-VL"],
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@ -8,6 +8,7 @@ from transformers import AutoModelForSpeechSeq2Seq
 from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
 from ...registry import HF_EXAMPLE_MODELS
@ -34,6 +35,12 @@ audio_lora_path = MODEL_NAME
 models = [MODEL_NAME]
@pytest.fixture(autouse=True)
 def set_attention_backend_for_rocm(monkeypatch):
    if current_platform.is_rocm():
        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
 def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
@ -111,8 +118,12 @@ def run_test(
@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize(
-@pytest.mark.parametrize("max_model_len", [2048])
+    "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
 )
@pytest.mark.parametrize(
    "max_model_len", [512] if current_platform.is_rocm() else [2048]
 )
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
 def test_models(
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@ -15,6 +15,7 @@ from transformers import AutoProcessor
 from vllm import SamplingParams, TextPrompt, TokensPrompt
 from vllm.logprobs import Logprob, SampleLogprobs
 from vllm.multimodal import MultiModalDataBuiltins
 from vllm.platforms import current_platform
 from ....utils import VLLM_PATH, large_gpu_test
 from ...utils import check_logprobs_close
@ -165,6 +166,15 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
 def test_chat(
    vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
 ) -> None:
    if (
        model == MISTRAL_SMALL_3_1_ID
        and max_model_len == 65536
        and current_platform.is_rocm()
    ):
        pytest.skip(
            "OOM on ROCm: 24B model with 65536 context length exceeds GPU memory"
        )
    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
    with vllm_runner(
        model,
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@ -140,7 +140,7 @@ def video_with_metadata_glm4_1v():
    metadata = VIDEO_ASSETS[0].metadata
    question = "Describe the video."
    video_prompt = "<|begin_of_video|><|video|><|end_of_video|>"
-    formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
+    formatted_prompt = f"[gMASK]<|user|>\n{video_prompt}{question}<|assistant|>\n"
    scales = [0.1, 0.2, 0.25]
    video_input = [
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@ -25,6 +25,7 @@ from transformers import (
 from transformers.video_utils import VideoMetadata
 from vllm.logprobs import SampleLogprobs
 from vllm.platforms import current_platform
 from vllm.utils.collection_utils import is_list_of
 from .....conftest import HfRunner, ImageAsset, ImageTestAssets
@ -366,6 +367,40 @@ def gemma3_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOut
 def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for GLM4V."""
    if current_platform.is_rocm():
        import types
        config = hf_model.model.config
        if hasattr(config, "num_layers") and not hasattr(config, "num_hidden_layers"):
            config.num_hidden_layers = config.num_layers
        config.output_hidden_states = True
        def patched_prepare_cache(
            self, generation_config, model_kwargs, *args, **kwargs
        ):
            model_kwargs["past_key_values"] = None
            model_kwargs["use_cache"] = False
            return model_kwargs
        hf_model.model._prepare_cache_for_generation = types.MethodType(
            patched_prepare_cache, hf_model.model
        )
        original_generate = hf_model.model.generate
        def patched_generate(*args, **kwargs):
            kwargs["output_hidden_states"] = True
            kwargs["return_dict_in_generate"] = True
            return original_generate(*args, **kwargs)
        hf_model.model.generate = patched_generate
        original_forward = hf_model.model.forward
        def patched_forward(*args, **kwargs):
            kwargs["output_hidden_states"] = True
            return original_forward(*args, **kwargs)
        hf_model.model.forward = patched_forward
    hf_processor = hf_model.processor
    def processor(*args, text="", images=None, **kwargs):
@ -406,7 +441,15 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        if videos is not None and is_list_of(videos, tuple):
            # If videos is a list of tuples, we assume each tuple contains
            # (video_array, metadata) as in the case of GLM4.1V.
-            video_metadata = [[VideoMetadata(**video[1])] for video in videos]
+            # Filter out 'do_sample_frames' as it's not a valid VideoMetadata arg
            video_metadata = [
                [
                    VideoMetadata(
                        **{k: v for k, v in video[1].items() if k != "do_sample_frames"}
                    )
                ]
                for video in videos
            ]
            videos = [[video[0]] for video in videos]
        else:
            video_metadata = None
--- a/tests/models/multimodal/pooling/conftest.py
+++ b/tests/models/multimodal/pooling/conftest.py
@ -0,0 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling tests."""
 import os
 import warnings
 from vllm.platforms import current_platform
 def pytest_collection_modifyitems(config, items):
    """Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
    if not current_platform.is_rocm():
        return
    siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
    if siglip_tests:
        os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
        warnings.warn(
            "ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
            UserWarning,
            stacklevel=1,
        )
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -667,6 +667,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "moonshotai/Kimi-VL-A3B-Instruct",
        extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},
        trust_remote_code=True,
        max_transformers_version="4.53.3",
        transformers_version_reason="HF model uses deprecated transformers API "
        "(PytorchGELUTanh, DynamicCache.seen_tokens, and more). See: "
        "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31",
    ),
    "LightOnOCRForConditionalGeneration": _HfExamplesInfo(
        "lightonai/LightOnOCR-1B",
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@ -31,6 +31,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
    vllm_is_batch_invariant,
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
 from vllm.v1.attention.backends.utils import (
@ -927,7 +928,18 @@ def get_kernel_options(
        if torch.cuda.is_available():
            device_props = torch.cuda.get_device_properties()
-            max_shared_memory = device_props.shared_memory_per_block_optin
+            # ROCm doesn't expose shared_memory_per_block_optin attribute
            # AMD GPUs typically have 64KB LDS (Local Data Share) per workgroup
            if hasattr(device_props, "shared_memory_per_block_optin"):
                max_shared_memory = device_props.shared_memory_per_block_optin
            elif current_platform.is_rocm():
                # ROCm fallback: use 64KB
                max_shared_memory = 65536
            else:
                raise RuntimeError(
                    "Unable to determine shared memory size on this hardware."
                )
            if max_shared_memory < 144 * 1024:
                block_m_candidate = ensure_divisible(
                    max(1, block_m_candidate // 2), block_m