[ROCm][CI] Fix entrypoints tests and Python-only installation test on ROCm (#28979)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Authored by Andreas Karatzas on 2025-12-24 00:42:30 -06:00, committed by GitHub
parent 8ee90c83f8
commit 0247a91e00
26 changed files with 432 additions and 116 deletions

setup.py
View File

@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
sys.platform,
)
VLLM_TARGET_DEVICE = "empty"
elif (
sys.platform.startswith("linux")
and torch.version.cuda is None
and os.getenv("VLLM_TARGET_DEVICE") is None
and torch.version.hip is None
):
# if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
# fallback to cpu
VLLM_TARGET_DEVICE = "cpu"
elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
if torch.version.hip is not None:
VLLM_TARGET_DEVICE = "rocm"
logger.info("Auto-detected ROCm")
elif torch.version.cuda is not None:
VLLM_TARGET_DEVICE = "cuda"
logger.info("Auto-detected CUDA")
else:
VLLM_TARGET_DEVICE = "cpu"
def is_sccache_available() -> bool:
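For reference, the detection order introduced above (explicit VLLM_TARGET_DEVICE, then ROCm, then CUDA, then CPU) can be reproduced outside the build; the snippet below is a standalone sketch that assumes a local torch install and is not part of setup.py.

import os
import torch

# Standalone sketch mirroring the auto-detection order above
# (hypothetical helper, not part of setup.py).
def predict_target_device() -> str:
    explicit = os.getenv("VLLM_TARGET_DEVICE")
    if explicit is not None:
        return explicit
    if torch.version.hip is not None:
        return "rocm"  # ROCm builds of torch expose a HIP version string
    if torch.version.cuda is not None:
        return "cuda"
    return "cpu"       # neither HIP nor CUDA available

print(predict_target_device())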
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
num_jobs = os.cpu_count()
nvcc_threads = None
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
# `nvcc_threads` is either the value of the NVCC_THREADS
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads = envs.NVCC_THREADS
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
if _is_cuda() and CUDA_HOME is not None:
try:
nvcc_version = get_nvcc_cuda_version()
if nvcc_version >= Version("11.2"):
# `nvcc_threads` is either the value of the NVCC_THREADS
# environment variable (if defined) or 1.
# when it is set, we reduce `num_jobs` to avoid
# overloading the system.
nvcc_threads = envs.NVCC_THREADS
if nvcc_threads is not None:
nvcc_threads = int(nvcc_threads)
logger.info(
"Using NVCC_THREADS=%d as the number of nvcc threads.",
nvcc_threads,
)
else:
nvcc_threads = 1
num_jobs = max(1, num_jobs // nvcc_threads)
except Exception as e:
logger.warning("Failed to get NVCC version: %s", e)
return num_jobs, nvcc_threads
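As a worked example of the split computed above (all numbers hypothetical): on a 16-core machine with NVCC_THREADS=4, nvcc gets 4 threads and the number of parallel jobs drops to max(1, 16 // 4) = 4; with NVCC_THREADS unset, nvcc_threads defaults to 1 and all 16 jobs are kept.

# Toy illustration of the job split above; the numbers are made up.
num_jobs, nvcc_threads = 16, 4                 # os.cpu_count() = 16, NVCC_THREADS = 4
num_jobs = max(1, num_jobs // nvcc_threads)
assert (num_jobs, nvcc_threads) == (4, 4)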
@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
# Default build tool to whatever cmake picks.
build_tool = []
# Make sure we use the nvcc from CUDA_HOME
if _is_cuda():
if _is_cuda() and CUDA_HOME is not None:
cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
elif _is_hip():
elif _is_hip() and ROCM_HOME is not None:
cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
other_cmake_args = os.environ.get("CMAKE_ARGS")
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
wheels = json.loads(resp.read().decode("utf-8"))
return wheels, repo_url
@staticmethod
def is_rocm_system() -> bool:
"""Detect ROCm without relying on torch (for build environment)."""
if os.getenv("ROCM_PATH"):
return True
if os.path.isdir("/opt/rocm"):
return True
if which("rocminfo") is not None:
return True
try:
import torch
return torch.version.hip is not None
except ImportError:
return False
@staticmethod
def find_local_rocm_wheel() -> str | None:
"""Search for a local vllm wheel in common locations."""
import glob
for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
wheels = glob.glob(pattern)
if wheels:
return sorted(wheels)[-1]
return None
@staticmethod
def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
"""Fetch the latest wheel URL from a PyPI-style simple index."""
import platform
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen
arch = platform.machine()
class WheelLinkParser(HTMLParser):
def __init__(self):
super().__init__()
self.wheels = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for name, value in attrs:
if name == "href" and value.endswith(".whl"):
self.wheels.append(value)
simple_url = f"{index_url.rstrip('/')}/{package}/"
print(f"Fetching wheel list from {simple_url}")
with urlopen(simple_url) as resp:
html = resp.read().decode("utf-8")
parser = WheelLinkParser()
parser.feed(html)
for wheel in reversed(parser.wheels):
if arch in wheel:
if wheel.startswith("http"):
return wheel
return urljoin(simple_url, wheel)
raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
@staticmethod
def determine_wheel_url_rocm() -> tuple[str, str | None]:
"""Determine the precompiled wheel for ROCm."""
# Search for local wheel first
local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
if local_wheel is not None:
print(f"Found local ROCm wheel: {local_wheel}")
return local_wheel, None
# Fall back to AMD's PyPI index
index_url = os.getenv(
"VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
)
print(f"Fetching ROCm precompiled wheel from {index_url}")
wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
download_filename = wheel_url.split("/")[-1].split("#")[0]
print(f"Using ROCm precompiled wheel: {wheel_url}")
return wheel_url, download_filename
@staticmethod
def determine_wheel_url() -> tuple[str, str | None]:
"""
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
print(f"Using user-specified precompiled wheel location: {wheel_location}")
return wheel_location, None
else:
# ROCm: use local wheel or AMD's PyPI index
# TODO: When we have ROCm nightly wheels, we can update this logic.
if precompiled_wheel_utils.is_rocm_system():
return precompiled_wheel_utils.determine_wheel_url_rocm()
import platform
arch = platform.machine()
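The simple-index parsing added above can be exercised without hitting the live AMD index; the snippet below is a self-contained sketch that feeds the same link-extraction logic an inline HTML fragment (the wheel filenames are invented).

# Self-contained sketch of the simple-index link parsing used above;
# the HTML and wheel filenames are invented for illustration.
import platform
from html.parser import HTMLParser

class WheelLinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.wheels: list[str] = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value.endswith(".whl"):
                    self.wheels.append(value)

html = """
<a href="vllm-0.0.0-cp312-abi3-manylinux1_x86_64.whl">vllm x86_64</a>
<a href="vllm-0.0.0-cp312-abi3-manylinux2014_aarch64.whl">vllm aarch64</a>
"""
parser = WheelLinkParser()
parser.feed(html)
arch = platform.machine()  # e.g. "x86_64"
matches = [w for w in reversed(parser.wheels) if arch in w]
print(matches[0] if matches else f"no wheel for {arch}")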
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
]
flash_attn_regex = re.compile(
@ -601,6 +697,8 @@ def get_rocm_version():
# Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
# see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
try:
if ROCM_HOME is None:
return None
librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
if not librocm_core_file.is_file():
return None
@ -745,7 +843,9 @@ if _is_hip():
if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# Optional since this doesn't get built (produce an .so file) when

View File

@ -5,6 +5,30 @@ import pytest
from vllm.assets.audio import AudioAsset
def add_attention_backend(server_args, attention_config):
"""Append attention backend CLI arg if specified.
Args:
server_args: List of server arguments to extend in-place.
attention_config: Dict with 'backend' key, or None.
"""
if attention_config and "backend" in attention_config:
server_args.extend(["--attention-backend", attention_config["backend"]])
@pytest.fixture(scope="module")
def rocm_aiter_fa_attention():
"""Return attention config for transcription/translation tests on ROCm.
On ROCm, audio tests require ROCM_AITER_FA attention backend.
"""
from vllm.platforms import current_platform
if current_platform.is_rocm():
return {"backend": "ROCM_AITER_FA"}
return None
@pytest.fixture
def mary_had_lamb():
path = AudioAsset("mary_had_lamb").get_local_path()
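For clarity, this is how the two helpers above are meant to compose in the audio tests below; a minimal usage sketch that reuses add_attention_backend from this file (the dict literal mirrors what the fixture returns on ROCm).

# Minimal usage sketch of the helpers above; the config dict is what
# rocm_aiter_fa_attention returns on ROCm (it returns None elsewhere).
server_args = ["--enforce-eager"]
add_attention_backend(server_args, {"backend": "ROCM_AITER_FA"})
# server_args == ["--enforce-eager", "--attention-backend", "ROCM_AITER_FA"]
add_attention_backend(server_args, None)  # None (non-ROCm) leaves the args untouched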

View File

@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
{"role": "system", "content": "you are a helpful assistant"},
{"role": "user", "content": "what is 1+1?"},
]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
max_completion_tokens=5,
logprobs=True,
top_logprobs=5,
)
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=37, total_tokens=47
completion_tokens=5, prompt_tokens=37, total_tokens=42
)
message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.content is not None and len(message.content) >= 5
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
max_completion_tokens=5,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0

View File

@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
"2",
*passed_params,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_code_interpreter(client: OpenAI, model_name: str):
response = await client.responses.create(
# Code interpreter may need more time for container init + code execution
timeout_value = client.timeout * 3
client_with_timeout = client.with_options(timeout=timeout_value)
response = await client_with_timeout.responses.create(
model=model_name,
# TODO: Ideally should be able to set max tool calls
# to prevent multi-turn, but it is not currently supported
@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=3)
async def test_function_call_with_previous_input_messages(
client: OpenAI, model_name: str
):

View File

@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
add_generation_prompt=True,
enable_thinking=False, # default with Qwen3
)
for ignore_eos in [True, False]:
payload = {
"model": MODEL_NAME,
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
}
generate_resp = await client.post(GEN_ENDPOINT, json=payload)
generate_data = generate_resp.json()
generate_res = tokenizer.decode(
generate_data["choices"][0]["token_ids"], skip_special_tokens=True
)
gen_token_ids = generate_data["choices"][0]["token_ids"]
generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
payload = {
"model": MODEL_NAME,
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
"temperature": 0.0,
"stream": False,
"ignore_eos": ignore_eos,
"chat_template_kwargs": dict(enable_thinking=False),
"chat_template_kwargs": {"enable_thinking": False},
}
completions_resp = await client.post("/v1/chat/completions", json=payload)
completions_data = completions_resp.json()
completions_res = completions_data["choices"][0]["message"]["content"]
if ignore_eos:
# When ignoring EOS, only compare up to the first EOS token
# Post-EOS generation is undefined and may differ
eos_tokens = {
tokenizer.eos_token_id,
*tokenizer.additional_special_tokens_ids,
}
# Find first EOS in generated tokens
eos_pos = None
for i, tid in enumerate(gen_token_ids):
if tid in eos_tokens:
eos_pos = i
break
if eos_pos is not None:
gen_token_ids_truncated = gen_token_ids[:eos_pos]
generate_res = tokenizer.decode(
gen_token_ids_truncated, skip_special_tokens=True
)
# Truncate completions_res to same length for comparison
completions_res = completions_res[: len(generate_res)]
assert generate_res == completions_res
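A toy walk-through of the truncation rule, with invented token ids:

# Toy walk-through of the EOS truncation above; all ids are invented.
gen_token_ids = [101, 7, 8, 2, 55, 9]   # pretend 2 is the EOS token id
eos_tokens = {2}
eos_pos = next((i for i, tid in enumerate(gen_token_ids) if tid in eos_tokens), None)
if eos_pos is not None:
    gen_token_ids = gen_token_ids[:eos_pos]
assert gen_token_ids == [101, 7, 8]     # everything from the first EOS on is dropped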

View File

@ -9,10 +9,16 @@ import time
import openai
import pytest
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
# GPU initialization might take longer
_IS_ROCM = current_platform.is_rocm()
_SERVER_STARTUP_TIMEOUT = 120
_PROCESS_EXIT_TIMEOUT = 15
@pytest.mark.asyncio
async def test_shutdown_on_engine_failure():
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
"2",
"--disable-frontend-multiprocessing",
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
# stdout/stderr pipes are enabled during ROCm GPU initialization.
stdout=None if _IS_ROCM else subprocess.PIPE,
stderr=None if _IS_ROCM else subprocess.PIPE,
text=None if _IS_ROCM else True,
preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
)
@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
)
# Poll until server is ready
while time.time() - start_time < 30:
while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
try:
await client.completions.create(
model=MODEL_NAME, prompt="Hello", max_tokens=1
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
except Exception:
time.sleep(0.5)
if proc.poll() is not None:
stdout, stderr = proc.communicate(timeout=1)
pytest.fail(
f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
)
if _IS_ROCM:
pytest.fail(f"Server died during startup: {proc.returncode}")
else:
stdout, stderr = proc.communicate(timeout=1)
pytest.fail(
f"Server died during startup. "
f"stdout: {stdout}, stderr: {stderr}"
)
else:
proc.terminate()
proc.wait(timeout=5)
pytest.fail("Server failed to start in 30 seconds")
proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
# Kill server to simulate crash
proc.terminate()
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
model=MODEL_NAME, prompt="This should fail", max_tokens=1
)
return_code = proc.wait(timeout=5)
return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
assert return_code is not None

View File

@ -7,6 +7,7 @@ import json
import pytest
from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
async def test_basic_audio(mary_had_lamb, model_name):
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager"]
if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech"
server_args = [
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"2048",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
@pytest.mark.asyncio
async def test_basic_audio_gemma(foscolo):
async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
# Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately.
model_name = "google/gemma-3n-E2B-it"
server_args = ["--enforce-eager"]
add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer(
model_name, server_args, max_wait_seconds=480
) as remote_server:

View File

@ -14,16 +14,26 @@ import pytest_asyncio
import soundfile as sf
from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
SERVER_ARGS = ["--enforce-eager"]
def _get_server_args(attention_config):
"""Get server args with attention backend if specified."""
args = SERVER_ARGS.copy()
add_attention_backend(args, attention_config)
return args
@pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request):
def server(request, rocm_aiter_fa_attention):
# Parametrize over model name
with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
yield remote_server, request.param
@ -35,10 +45,12 @@ async def client_and_model(server):
@pytest.mark.asyncio
async def test_non_asr_model(foscolo):
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
# text to text model
model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
client = remote_server.get_async_client()
res = await client.audio.translations.create(
model=model_name, file=foscolo, temperature=0.0
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb):
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
# NOTE - careful to call this test before the module scoped server
# fixture, otherwise it'll OOMkill the CI
model_name = "ibm-granite/granite-speech-3.3-2b"
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"2048",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()

View File

@ -8,6 +8,7 @@ import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_url, fetch_video
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
@ -37,7 +38,16 @@ def server():
json.dumps({"video": MAXIMUM_VIDEOS}),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
# ROCm: Increase timeouts to handle potential network delays and slower
# video processing when downloading multiple videos from external sources
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server
@ -287,6 +297,11 @@ async def test_chat_streaming_video(
@pytest.mark.parametrize(
"video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
)
@pytest.mark.flaky(
reruns=2,
reruns_delay=5,
condition=current_platform.is_rocm(),
)
async def test_multi_video_input(
client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
):

View File

@ -10,6 +10,7 @@ from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
],
]
EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
# MultiHeadAttention attn_backend: FLASH_ATTN
# with Triton Attention backend
[
"The image shows a wooden boardwalk leading through a",
"The image shows a wooden boardwalk extending into a",
],
[
"The image shows two parrots perched on",
"The image shows two birds perched on a cur",
],
[
"The image shows a Venn diagram with three over",
"The image contains a Venn diagram with three over",
],
[
"This image displays a gradient of colors ranging from",
"This image displays a gradient of colors transitioning from",
],
]
@pytest.fixture(scope="module")
def server():
@ -59,7 +81,16 @@ def server():
json.dumps({"image": MAXIMUM_IMAGES}),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
# ROCm: Increase timeouts to handle potential network delays and slower
# video processing when downloading multiple videos from external sources
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server
@ -288,9 +319,16 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
image_idx: int,
url_encoded_image: dict[str, str],
):
# ROCm: Switch expected results based on platform
from vllm.platforms import current_platform
# NOTE: This test also validates that we pass MM data through beam search
raw_image_url = TEST_IMAGE_ASSETS[image_idx]
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
if current_platform.is_rocm():
expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
else:
expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])

View File

@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
]
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)

View File

@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
PROMPTS = [
@ -35,6 +30,12 @@ TOKEN_IDS = [
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
@ -44,6 +45,7 @@ def llm():
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)

View File

@ -9,11 +9,6 @@ import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
max_model_len = 128
@ -44,6 +39,10 @@ def server():
str(max_model_len),
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling embed tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
warnings.warn(
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
"to avoid HuggingFace Transformers accuracy issues",
UserWarning,
stacklevel=1,
)
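A quick way to confirm the hook took effect is to query the matching torch flags from a test; the sketch below is an assumption about how one might verify it, not part of the commit.

# Hypothetical sanity check that the SDP toggles above are in effect.
import torch

assert not torch.backends.cuda.flash_sdp_enabled()
assert not torch.backends.cuda.mem_efficient_sdp_enabled()
assert torch.backends.cuda.math_sdp_enabled()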

View File

@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "intfloat/e5-small"
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
prompts = ["The chef prepared a delicious meal."]
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
@ -32,6 +33,7 @@ def llm():
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)

View File

@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
decode_pooling_output,
)
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "intfloat/multilingual-e5-small"
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
DTYPE = "bfloat16"
if current_platform.is_rocm():
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
@pytest.fixture(scope="module")
def server():
args = [
@ -53,6 +57,10 @@ def server():
DUMMY_CHAT_TEMPLATE,
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
EmbedModelInfo(
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
)
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model_info.name, args) as remote_server:
yield remote_server

View File

@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
def _generate_random_text(word_count: int) -> str:
"""Generate random text with approximately the specified word count."""
@ -228,6 +223,10 @@ def server_with_chunked_processing():
"0.8",
]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@ -30,6 +25,10 @@ st_main_score = 0.33457
def server():
args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module")
def llm():
# ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
# that supports encoder-only models on ROCm.
attention_config = None
if current_platform.is_rocm():
attention_config = {"backend": "FLEX_ATTENTION"}
# pytest caches the fixture so we use weakref.proxy to
# enable garbage collection
llm = LLM(
@ -30,6 +31,7 @@ def llm():
gpu_memory_utilization=0.75,
enforce_eager=True,
seed=0,
attention_config=attention_config,
)
yield weakref.proxy(llm)

View File

@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
from vllm.entrypoints.pooling.score.protocol import RerankResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
@ -24,6 +19,10 @@ DTYPE = "bfloat16"
def server():
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server

View File

@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
from vllm.platforms import current_platform
if current_platform.is_rocm():
pytest.skip(
"Encoder self-attention is not implemented on ROCm.", allow_module_level=True
)
MODELS = [
{"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
{"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
@ -44,6 +39,10 @@ def model(request):
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
# ROCm: Use Flex Attention to support encoder-only self-attention.
if current_platform.is_rocm():
args.extend(["--attention-backend", "FLEX_ATTENTION"])
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server

View File

@ -4,6 +4,11 @@
set -e
set -x
if command -v rocminfo >/dev/null 2>&1; then
echo "Skipping test for ROCm platform"
exit 0
fi
cd /vllm-workspace/
rm -rf .venv
@ -36,7 +41,7 @@ if diff before.txt after.txt; then
echo "torch version not overridden."
else
echo "torch version overridden by nightly_torch_test.txt, \
if the dependency is not triggered by the pytroch nightly test,\
if the dependency is not triggered by the pytorch nightly test,\
please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
exit 1
fi

View File

@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling embed tests."""
import warnings
import torch
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)
warnings.warn(
"ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
"to avoid HuggingFace Transformers accuracy issues",
UserWarning,
stacklevel=1,
)