# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import weakref
from contextlib import ExitStack

import pytest

from tests.utils import wait_for_gpu_memory_to_clear
from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode
from vllm.platforms import current_platform

# test attention backend and cudagraph_mode combo
# (backend_name, cudagraph_mode, supported)
if current_platform.is_rocm():
    combo_cases_1 = [
        ("RocmAttn", "FULL", True),
        ("RocmAttn", "FULL_AND_PIECEWISE", True),
        ("TritonAttn", "FULL", True),
        ("TritonAttn", "FULL_AND_PIECEWISE", True),
    ]
else:
    combo_cases_1 = [
        ("FA3", "FULL", True),
        ("FA3", "FULL_AND_PIECEWISE", True),
        ("FA2", "FULL", True),  # Should fallback to FULL_AND_PIECEWISE
        ("FA2", "FULL_AND_PIECEWISE", True),
        ("FlashInfer", "FULL", True),  # Should fallback to FULL_AND_PIECEWISE
        ("FlashInfer", "FULL_AND_PIECEWISE", True),
    ]


@pytest.mark.parametrize("backend_name, cudagraph_mode, supported", combo_cases_1)
def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supported):
    if backend_name == "FlashInfer":
        try:
            import flashinfer  # noqa: F401
        except ImportError:
            pytest.skip("FlashInfer is not installed")

    backend_config = backend_configs[backend_name]

    # Dynamically skip test if GPU capability is not met
    if (
        backend_config.specific_gpu_arch
        and backend_config.specific_gpu_arch
        != current_platform.get_device_capability()
    ):
        pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")

    attention_config = backend_config.attention_config

    with ExitStack() as stack:
        if not supported:
            stack.enter_context(pytest.raises(Exception))

        llm = LLM(
            model="Qwen/Qwen2-1.5B-Instruct",
            max_num_seqs=256,
            trust_remote_code=True,
            gpu_memory_utilization=0.45,
            max_model_len=1024,
            attention_config=attention_config,
            compilation_config=CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
            ),
        )
        llm.generate(["Hello, my name is"] * 10)

    # when above code raises, `llm` may be undefined, so we need to catch that
    try:
        llm = weakref.proxy(llm)
        del llm
    except UnboundLocalError:
        pass

    wait_for_gpu_memory_to_clear(
        devices=[0],
        threshold_ratio=0.1,
    )


# test cudagraph_mode with different compilation mode.
# (backend_name, cudagraph_mode, compilation_mode, supported)
attn_backend = "RocmAttn" if current_platform.is_rocm() else "FA2"

combo_cases_2 = [
    (attn_backend, "FULL", CompilationMode.NONE, True),
    (attn_backend, "FULL", CompilationMode.VLLM_COMPILE, True),
    (attn_backend, "PIECEWISE", CompilationMode.NONE, True),
    (attn_backend, "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
    (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.NONE, True),
    (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
    (attn_backend, "FULL_DECODE_ONLY", CompilationMode.NONE, True),
    (attn_backend, "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
    (attn_backend, "NONE", CompilationMode.NONE, True),
    (attn_backend, "NONE", CompilationMode.VLLM_COMPILE, True),
]


@pytest.mark.parametrize(
    "backend_name,cudagraph_mode,compilation_mode,supported", combo_cases_2
)
def test_cudagraph_compilation_combo(
    backend_name, cudagraph_mode, compilation_mode, supported
):
    backend_config = backend_configs[backend_name]
    attention_config = backend_config.attention_config

    with ExitStack() as stack:
        if not supported:
            stack.enter_context(pytest.raises(Exception))

        llm = LLM(
            model="Qwen/Qwen2-1.5B-Instruct",
            max_num_seqs=256,
            trust_remote_code=True,
            gpu_memory_utilization=0.45,
            max_model_len=1024,
            attention_config=attention_config,
            compilation_config=CompilationConfig(
                mode=compilation_mode, cudagraph_mode=cudagraph_mode
            ),
        )
        llm.generate(["Hello, my name is"] * 10)

    # when above code raises, `llm` may be undefined, so we need to catch that
    try:
        llm = weakref.proxy(llm)
        del llm
    except UnboundLocalError:
        pass
    finally:
        wait_for_gpu_memory_to_clear(
            devices=[0],
            threshold_ratio=0.1,
        )