diff --git a/setup.py b/setup.py index 6fcb6653bc4a3..581d3c80c3d06 100644 --- a/setup.py +++ b/setup.py @@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin")) sys.platform, ) VLLM_TARGET_DEVICE = "empty" -elif ( - sys.platform.startswith("linux") - and torch.version.cuda is None - and os.getenv("VLLM_TARGET_DEVICE") is None - and torch.version.hip is None -): - # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set, - # fallback to cpu - VLLM_TARGET_DEVICE = "cpu" +elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None: + if torch.version.hip is not None: + VLLM_TARGET_DEVICE = "rocm" + logger.info("Auto-detected ROCm") + elif torch.version.cuda is not None: + VLLM_TARGET_DEVICE = "cuda" + logger.info("Auto-detected CUDA") + else: + VLLM_TARGET_DEVICE = "cpu" def is_sccache_available() -> bool: @@ -108,20 +108,26 @@ class cmake_build_ext(build_ext): num_jobs = os.cpu_count() nvcc_threads = None - if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"): - # `nvcc_threads` is either the value of the NVCC_THREADS - # environment variable (if defined) or 1. - # when it is set, we reduce `num_jobs` to avoid - # overloading the system. - nvcc_threads = envs.NVCC_THREADS - if nvcc_threads is not None: - nvcc_threads = int(nvcc_threads) - logger.info( - "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads - ) - else: - nvcc_threads = 1 - num_jobs = max(1, num_jobs // nvcc_threads) + if _is_cuda() and CUDA_HOME is not None: + try: + nvcc_version = get_nvcc_cuda_version() + if nvcc_version >= Version("11.2"): + # `nvcc_threads` is either the value of the NVCC_THREADS + # environment variable (if defined) or 1. + # when it is set, we reduce `num_jobs` to avoid + # overloading the system. + nvcc_threads = envs.NVCC_THREADS + if nvcc_threads is not None: + nvcc_threads = int(nvcc_threads) + logger.info( + "Using NVCC_THREADS=%d as the number of nvcc threads.", + nvcc_threads, + ) + else: + nvcc_threads = 1 + num_jobs = max(1, num_jobs // nvcc_threads) + except Exception as e: + logger.warning("Failed to get NVCC version: %s", e) return num_jobs, nvcc_threads @@ -199,9 +205,9 @@ class cmake_build_ext(build_ext): # Default build tool to whatever cmake picks. 
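# Illustrative sketch (not part of the patch): the job heuristic guarded in the
# hunk above, condensed. NVCC_THREADS is optional; when nvcc fans out to N
# threads per compile job, the outer job count is divided by N so total
# parallelism stays close to os.cpu_count().
import os

def compute_build_jobs(nvcc_usable: bool) -> tuple[int, int | None]:
    num_jobs = os.cpu_count() or 1
    nvcc_threads = None
    if nvcc_usable:  # CUDA build, CUDA_HOME set, nvcc >= 11.2
        nvcc_threads = int(os.getenv("NVCC_THREADS", "1"))
        num_jobs = max(1, num_jobs // nvcc_threads)
    return num_jobs, nvcc_threads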
build_tool = [] # Make sure we use the nvcc from CUDA_HOME - if _is_cuda(): + if _is_cuda() and CUDA_HOME is not None: cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"] - elif _is_hip(): + elif _is_hip() and ROCM_HOME is not None: cmake_args += [f"-DROCM_PATH={ROCM_HOME}"] other_cmake_args = os.environ.get("CMAKE_ARGS") @@ -339,6 +345,89 @@ class precompiled_wheel_utils: wheels = json.loads(resp.read().decode("utf-8")) return wheels, repo_url + @staticmethod + def is_rocm_system() -> bool: + """Detect ROCm without relying on torch (for build environment).""" + if os.getenv("ROCM_PATH"): + return True + if os.path.isdir("/opt/rocm"): + return True + if which("rocminfo") is not None: + return True + try: + import torch + + return torch.version.hip is not None + except ImportError: + return False + + @staticmethod + def find_local_rocm_wheel() -> str | None: + """Search for a local vllm wheel in common locations.""" + import glob + + for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]: + wheels = glob.glob(pattern) + if wheels: + return sorted(wheels)[-1] + return None + + @staticmethod + def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str: + """Fetch the latest wheel URL from a PyPI-style simple index.""" + import platform + from html.parser import HTMLParser + from urllib.parse import urljoin + from urllib.request import urlopen + + arch = platform.machine() + + class WheelLinkParser(HTMLParser): + def __init__(self): + super().__init__() + self.wheels = [] + + def handle_starttag(self, tag, attrs): + if tag == "a": + for name, value in attrs: + if name == "href" and value.endswith(".whl"): + self.wheels.append(value) + + simple_url = f"{index_url.rstrip('/')}/{package}/" + print(f"Fetching wheel list from {simple_url}") + with urlopen(simple_url) as resp: + html = resp.read().decode("utf-8") + + parser = WheelLinkParser() + parser.feed(html) + + for wheel in reversed(parser.wheels): + if arch in wheel: + if wheel.startswith("http"): + return wheel + return urljoin(simple_url, wheel) + + raise ValueError(f"No compatible wheel found for {arch} at {simple_url}") + + @staticmethod + def determine_wheel_url_rocm() -> tuple[str, str | None]: + """Determine the precompiled wheel for ROCm.""" + # Search for local wheel first + local_wheel = precompiled_wheel_utils.find_local_rocm_wheel() + if local_wheel is not None: + print(f"Found local ROCm wheel: {local_wheel}") + return local_wheel, None + + # Fall back to AMD's PyPI index + index_url = os.getenv( + "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple" + ) + print(f"Fetching ROCm precompiled wheel from {index_url}") + wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url) + download_filename = wheel_url.split("/")[-1].split("#")[0] + print(f"Using ROCm precompiled wheel: {wheel_url}") + return wheel_url, download_filename + @staticmethod def determine_wheel_url() -> tuple[str, str | None]: """ @@ -359,6 +448,11 @@ class precompiled_wheel_utils: print(f"Using user-specified precompiled wheel location: {wheel_location}") return wheel_location, None else: + # ROCm: use local wheel or AMD's PyPI index + # TODO: When we have ROCm nightly wheels, we can update this logic. 
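# Illustrative sketch (not part of the patch): the AMD index is assumed to be a
# plain PEP 503 "simple" page, one <a href="...whl"> per file with the newest
# release listed last. Fed a made-up page, the link scan defined above behaves
# like this:
from html.parser import HTMLParser

SAMPLE_PAGE = (
    '<a href="vllm-0.7.0+rocm624-cp312-cp312-linux_x86_64.whl">old</a>'
    '<a href="vllm-0.7.1+rocm624-cp312-cp312-linux_x86_64.whl">new</a>'
)

class _Links(HTMLParser):
    def __init__(self):
        super().__init__()
        self.wheels = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for k, v in attrs:
                if k == "href" and v and v.endswith(".whl"):
                    self.wheels.append(v)

parser = _Links()
parser.feed(SAMPLE_PAGE)
# fetch_wheel_from_pypi_index walks the links in reverse and keeps the first
# one matching platform.machine(), so the newest arch-compatible wheel wins.
newest = next(w for w in reversed(parser.wheels) if "x86_64" in w)
assert newest.startswith("vllm-0.7.1")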
+ if precompiled_wheel_utils.is_rocm_system(): + return precompiled_wheel_utils.determine_wheel_url_rocm() + import platform arch = platform.machine() @@ -465,6 +559,8 @@ class precompiled_wheel_utils: "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/cumem_allocator.abi3.so", + # ROCm-specific libraries + "vllm/_rocm_C.abi3.so", ] flash_attn_regex = re.compile( @@ -601,6 +697,8 @@ def get_rocm_version(): # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 try: + if ROCM_HOME is None: + return None librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" if not librocm_core_file.is_file(): return None @@ -745,7 +843,9 @@ if _is_hip(): if _is_cuda(): ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) - if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"): + if envs.VLLM_USE_PRECOMPILED or ( + CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3") + ): # FA3 requires CUDA 12.3 or later ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) # Optional since this doesn't get built (produce an .so file) when diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py index b40079d8dc3d5..098a9a72325ba 100644 --- a/tests/entrypoints/openai/conftest.py +++ b/tests/entrypoints/openai/conftest.py @@ -5,6 +5,30 @@ import pytest from vllm.assets.audio import AudioAsset +def add_attention_backend(server_args, attention_config): + """Append attention backend CLI arg if specified. + + Args: + server_args: List of server arguments to extend in-place. + attention_config: Dict with 'backend' key, or None. + """ + if attention_config and "backend" in attention_config: + server_args.extend(["--attention-backend", attention_config["backend"]]) + + +@pytest.fixture(scope="module") +def rocm_aiter_fa_attention(): + """Return attention config for transcription/translation tests on ROCm. + + On ROCm, audio tests require ROCM_AITER_FA attention backend. 
+ """ + from vllm.platforms import current_platform + + if current_platform.is_rocm(): + return {"backend": "ROCM_AITER_FA"} + return None + + @pytest.fixture def mary_had_lamb(): path = AudioAsset("mary_had_lamb").get_local_path() diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index b2909f21e4dd8..ab78a79774564 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): {"role": "system", "content": "you are a helpful assistant"}, {"role": "user", "content": "what is 1+1?"}, ] - # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=5, logprobs=True, top_logprobs=5, ) @@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] + assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=37, total_tokens=47 + completion_tokens=5, prompt_tokens=37, total_tokens=42 ) message = choice.message - assert message.content is not None and len(message.content) >= 10 + assert message.content is not None and len(message.content) >= 5 assert message.role == "assistant" messages.append({"role": "assistant", "content": message.content}) @@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_completion_tokens=10, + max_completion_tokens=5, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_optional_middleware.py b/tests/entrypoints/openai/test_optional_middleware.py index b67d6147937d1..c2c7fbdb01140 100644 --- a/tests/entrypoints/openai/test_optional_middleware.py +++ b/tests/entrypoints/openai/test_optional_middleware.py @@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest): "2", *passed_params, ] + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 8ef0d7f277d5f..718e0edba8373 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_code_interpreter(client: OpenAI, model_name: str): - response = await client.responses.create( + # Code interpreter may need more time for container init + code execution + timeout_value = client.timeout * 3 + client_with_timeout = client.with_options(timeout=timeout_value) + + response = await client_with_timeout.responses.create( model=model_name, # TODO: Ideally should be able to set max tool calls # to prevent multi-turn, but it is not currently supported @@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.flaky(reruns=3) async def test_function_call_with_previous_input_messages( 
client: OpenAI, model_name: str ): diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py index 62d843e35b86f..acbbaa659c82b 100644 --- a/tests/entrypoints/openai/test_serving_tokens.py +++ b/tests/entrypoints/openai/test_serving_tokens.py @@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): add_generation_prompt=True, enable_thinking=False, # default with Qwen3 ) + for ignore_eos in [True, False]: payload = { "model": MODEL_NAME, @@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): } generate_resp = await client.post(GEN_ENDPOINT, json=payload) generate_data = generate_resp.json() - generate_res = tokenizer.decode( - generate_data["choices"][0]["token_ids"], skip_special_tokens=True - ) + gen_token_ids = generate_data["choices"][0]["token_ids"] + generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True) payload = { "model": MODEL_NAME, @@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages): "temperature": 0.0, "stream": False, "ignore_eos": ignore_eos, - "chat_template_kwargs": dict(enable_thinking=False), + "chat_template_kwargs": {"enable_thinking": False}, } completions_resp = await client.post("/v1/chat/completions", json=payload) completions_data = completions_resp.json() completions_res = completions_data["choices"][0]["message"]["content"] + if ignore_eos: + # When ignoring EOS, only compare up to the first EOS token + # Post-EOS generation is undefined and may differ + eos_tokens = { + tokenizer.eos_token_id, + *tokenizer.additional_special_tokens_ids, + } + # Find first EOS in generated tokens + eos_pos = None + for i, tid in enumerate(gen_token_ids): + if tid in eos_tokens: + eos_pos = i + break + if eos_pos is not None: + gen_token_ids_truncated = gen_token_ids[:eos_pos] + generate_res = tokenizer.decode( + gen_token_ids_truncated, skip_special_tokens=True + ) + # Truncate completions_res to same length for comparison + completions_res = completions_res[: len(generate_res)] + assert generate_res == completions_res diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index d75119cb7b43d..a2ac49bcb0b25 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -9,10 +9,16 @@ import time import openai import pytest +from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" +# GPU initialization might take take longer +_IS_ROCM = current_platform.is_rocm() +_SERVER_STARTUP_TIMEOUT = 120 +_PROCESS_EXIT_TIMEOUT = 15 + @pytest.mark.asyncio async def test_shutdown_on_engine_failure(): @@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure(): "2", "--disable-frontend-multiprocessing", ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, + # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when + # stdout/stderr pipes are enabled during ROCm GPU initialization. 
+ stdout=None if _IS_ROCM else subprocess.PIPE, + stderr=None if _IS_ROCM else subprocess.PIPE, + text=None if _IS_ROCM else True, preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN), ) @@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure(): ) # Poll until server is ready - while time.time() - start_time < 30: + while time.time() - start_time < _SERVER_STARTUP_TIMEOUT: try: await client.completions.create( model=MODEL_NAME, prompt="Hello", max_tokens=1 @@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure(): except Exception: time.sleep(0.5) if proc.poll() is not None: - stdout, stderr = proc.communicate(timeout=1) - pytest.fail( - f"Server died during startup. stdout: {stdout}, stderr: {stderr}" - ) + if _IS_ROCM: + pytest.fail(f"Server died during startup: {proc.returncode}") + else: + stdout, stderr = proc.communicate(timeout=1) + pytest.fail( + f"Server died during startup. " + f"stdout: {stdout}, stderr: {stderr}" + ) else: proc.terminate() - proc.wait(timeout=5) - pytest.fail("Server failed to start in 30 seconds") + proc.wait(timeout=_PROCESS_EXIT_TIMEOUT) + pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds") # Kill server to simulate crash proc.terminate() @@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure(): model=MODEL_NAME, prompt="This should fail", max_tokens=1 ) - return_code = proc.wait(timeout=5) + return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT) assert return_code is not None diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 8045ab1468d6a..ee8dea4e949bc 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -7,6 +7,7 @@ import json import pytest from ...utils import RemoteOpenAIServer +from .conftest import add_attention_backend MISTRAL_FORMAT_ARGS = [ "--tokenizer_mode", @@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [ @pytest.mark.asyncio @pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"]) -async def test_basic_audio(mary_had_lamb, model_name): +async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention): server_args = ["--enforce-eager"] if model_name.startswith("mistralai"): server_args += MISTRAL_FORMAT_ARGS + add_attention_backend(server_args, rocm_aiter_fa_attention) + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() @@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name): @pytest.mark.asyncio -async def test_basic_audio_with_lora(mary_had_lamb): +async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): """Ensure STT (transcribe) requests can pass LoRA through to generate.""" + # ROCm SPECIFIC CONFIGURATION: + # To ensure the test passes on ROCm, we modify the max model length to 512. + # We DO NOT apply this to other platforms to maintain strict upstream parity. 
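# Illustrative sketch (not part of the patch): how the conftest helper and the
# rocm_aiter_fa_attention fixture compose. On ROCm the fixture returns
# {"backend": "ROCM_AITER_FA"}; elsewhere it returns None and the helper is a
# no-op, so CUDA runs keep the original argument list.
args = ["--enforce-eager"]
add_attention_backend(args, {"backend": "ROCM_AITER_FA"})
assert args == ["--enforce-eager", "--attention-backend", "ROCM_AITER_FA"]
add_attention_backend(args, None)  # no backend requested: args unchanged
assert args == ["--enforce-eager", "--attention-backend", "ROCM_AITER_FA"]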
+ from vllm.platforms import current_platform + model_name = "ibm-granite/granite-speech-3.3-2b" lora_model_name = "speech" server_args = [ @@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb): "--lora-modules", f"{lora_model_name}={model_name}", "--max-model-len", - "2048", + "512" if current_platform.is_rocm() else "2048", "--max-num-seqs", "1", ] + add_attention_backend(server_args, rocm_aiter_fa_attention) + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() @@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb): @pytest.mark.asyncio -async def test_basic_audio_gemma(foscolo): +async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention): # Gemma accuracy on some of the audio samples we use is particularly bad, # hence we use a different one here. WER is evaluated separately. model_name = "google/gemma-3n-E2B-it" server_args = ["--enforce-eager"] + add_attention_backend(server_args, rocm_aiter_fa_attention) + with RemoteOpenAIServer( model_name, server_args, max_wait_seconds=480 ) as remote_server: diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index 2c577237691ab..cae45872ee6a6 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -14,16 +14,26 @@ import pytest_asyncio import soundfile as sf from ...utils import RemoteOpenAIServer +from .conftest import add_attention_backend SERVER_ARGS = ["--enforce-eager"] +def _get_server_args(attention_config): + """Get server args with attention backend if specified.""" + args = SERVER_ARGS.copy() + add_attention_backend(args, attention_config) + return args + + @pytest.fixture( scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"] ) -def server(request): +def server(request, rocm_aiter_fa_attention): # Parametrize over model name - with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: + with RemoteOpenAIServer( + request.param, _get_server_args(rocm_aiter_fa_attention) + ) as remote_server: yield remote_server, request.param @@ -35,10 +45,12 @@ async def client_and_model(server): @pytest.mark.asyncio -async def test_non_asr_model(foscolo): +async def test_non_asr_model(foscolo, rocm_aiter_fa_attention): # text to text model model_name = "JackFram/llama-68m" - with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server: + with RemoteOpenAIServer( + model_name, _get_server_args(rocm_aiter_fa_attention) + ) as remote_server: client = remote_server.get_async_client() res = await client.audio.translations.create( model=model_name, file=foscolo, temperature=0.0 @@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo): @pytest.mark.asyncio -async def test_basic_audio_with_lora(mary_had_lamb): +async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention): """Ensure STT (translate) requests can pass LoRA through to generate.""" + # ROCm SPECIFIC CONFIGURATION: + # To ensure the test passes on ROCm, we modify the max model length to 512. + # We DO NOT apply this to other platforms to maintain strict upstream parity. 
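# Illustrative sketch: _get_server_args copies SERVER_ARGS before extending it,
# so the ROCm-only backend flag never leaks into the module-level default
# shared by the other tests in this file.
args = _get_server_args({"backend": "ROCM_AITER_FA"})
assert args == ["--enforce-eager", "--attention-backend", "ROCM_AITER_FA"]
assert SERVER_ARGS == ["--enforce-eager"]  # untouched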
+ from vllm.platforms import current_platform + # NOTE - careful to call this test before the module scoped server # fixture, otherwise it'll OOMkill the CI model_name = "ibm-granite/granite-speech-3.3-2b" @@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb): "--lora-modules", f"{lora_model_name}={model_name}", "--max-model-len", - "2048", + "512" if current_platform.is_rocm() else "2048", "--max-num-seqs", "1", ] + add_attention_backend(server_args, rocm_aiter_fa_attention) + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. with RemoteOpenAIServer(model_name, server_args) as remote_server: client = remote_server.get_async_client() diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index e6b8a7f895636..65bda9e8bc010 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -8,6 +8,7 @@ import pytest import pytest_asyncio from vllm.multimodal.utils import encode_video_url, fetch_video +from vllm.platforms import current_platform from ...utils import RemoteOpenAIServer @@ -37,7 +38,16 @@ def server(): json.dumps({"video": MAXIMUM_VIDEOS}), ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # ROCm: Increase timeouts to handle potential network delays and slower + # video processing when downloading multiple videos from external sources + env_overrides = {} + if current_platform.is_rocm(): + env_overrides = { + "VLLM_VIDEO_FETCH_TIMEOUT": "120", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", + } + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server: yield remote_server @@ -287,6 +297,11 @@ async def test_chat_streaming_video( @pytest.mark.parametrize( "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))] ) +@pytest.mark.flaky( + reruns=2, + reruns_delay=5, + condition=current_platform.is_rocm(), +) async def test_multi_video_input( client: openai.AsyncOpenAI, model_name: str, video_urls: list[str] ): diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index bc3da6ef1554a..00823ff5f78ca 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -10,6 +10,7 @@ from transformers import AutoProcessor from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.utils import encode_image_url, fetch_image +from vllm.platforms import current_platform from ...utils import RemoteOpenAIServer @@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ], ] +EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [ + # MultiHeadAttention attn_backend: FLASH_ATTN + # with Triton Attention backend + [ + "The image shows a wooden boardwalk leading through a", + "The image shows a wooden boardwalk extending into a", + ], + [ + "The image shows two parrots perched on", + "The image shows two birds perched on a cur", + ], + [ + "The image shows a Venn diagram with three over", + "The image contains a Venn diagram with three over", + ], + [ + "This image displays a gradient of colors ranging from", + "This image displays a gradient of colors transitioning from", + ], +] + @pytest.fixture(scope="module") def server(): @@ -59,7 +81,16 @@ def server(): json.dumps({"image": MAXIMUM_IMAGES}), ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # ROCm: Increase timeouts to handle potential network delays and slower + # video processing when downloading multiple videos from external sources + env_overrides = {} + if 
current_platform.is_rocm(): + env_overrides = { + "VLLM_VIDEO_FETCH_TIMEOUT": "120", + "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", + } + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server: yield remote_server @@ -288,9 +319,16 @@ async def test_single_chat_session_image_base64encoded_beamsearch( image_idx: int, url_encoded_image: dict[str, str], ): + # ROCm: Switch expected results based on platform + from vllm.platforms import current_platform + # NOTE: This test also validates that we pass MM data through beam search raw_image_url = TEST_IMAGE_ASSETS[image_idx] - expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] + + if current_platform.is_rocm(): + expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx] + else: + expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url]) diff --git a/tests/entrypoints/openai/test_vision_embeds.py b/tests/entrypoints/openai/test_vision_embeds.py index 42d9fe4840bbe..067a00c6b9382 100644 --- a/tests/entrypoints/openai/test_vision_embeds.py +++ b/tests/entrypoints/openai/test_vision_embeds.py @@ -33,6 +33,7 @@ def _terratorch_dummy_messages(): ] +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] ) diff --git a/tests/entrypoints/pooling/basic/test_encode.py b/tests/entrypoints/pooling/basic/test_encode.py index f86ecef2e4744..ab3a0610c3e17 100644 --- a/tests/entrypoints/pooling/basic/test_encode.py +++ b/tests/entrypoints/pooling/basic/test_encode.py @@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "intfloat/multilingual-e5-small" PROMPTS = [ @@ -35,6 +30,12 @@ TOKEN_IDS = [ @pytest.fixture(scope="module") def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. + attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM( @@ -44,6 +45,7 @@ def llm(): gpu_memory_utilization=0.75, enforce_eager=True, seed=0, + attention_config=attention_config, ) yield weakref.proxy(llm) diff --git a/tests/entrypoints/pooling/basic/test_truncation.py b/tests/entrypoints/pooling/basic/test_truncation.py index 0d2d385840402..5d099dd1f4391 100644 --- a/tests/entrypoints/pooling/basic/test_truncation.py +++ b/tests/entrypoints/pooling/basic/test_truncation.py @@ -9,11 +9,6 @@ import pytest_asyncio from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" max_model_len = 128 @@ -44,6 +39,10 @@ def server(): str(max_model_len), ] + # ROCm: Use Flex Attention to support encoder-only self-attention. 
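# Illustrative sketch (not part of the patch): the FLEX_ATTENTION override is
# spelled two ways across these pooling tests -- as an LLM constructor argument
# for offline fixtures and as a CLI flag for RemoteOpenAIServer fixtures. The
# values below are the ones used throughout this diff.
from vllm import LLM

offline_llm = LLM(
    model="intfloat/multilingual-e5-small",
    enforce_eager=True,
    attention_config={"backend": "FLEX_ATTENTION"},  # offline form
)
server_args = [
    "--runner", "pooling", "--enforce-eager",
    "--attention-backend", "FLEX_ATTENTION",  # server/CLI form
]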
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/conftest.py b/tests/entrypoints/pooling/embed/conftest.py new file mode 100644 index 0000000000000..002b85874049c --- /dev/null +++ b/tests/entrypoints/pooling/embed/conftest.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM pooling embed tests.""" + +import warnings + +import torch + +from vllm.platforms import current_platform + + +def pytest_collection_modifyitems(config, items): + """Configure ROCm-specific settings based on collected tests.""" + if not current_platform.is_rocm(): + return + + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + warnings.warn( + "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp " + "to avoid HuggingFace Transformers accuracy issues", + UserWarning, + stacklevel=1, + ) diff --git a/tests/entrypoints/pooling/embed/test_correctness_mteb.py b/tests/entrypoints/pooling/embed/test_correctness_mteb.py index 64673534fd32a..8cdd3d3c858d5 100644 --- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py +++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py @@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" MODEL_NAME = "intfloat/e5-small" @@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667 def server(): args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_offline.py b/tests/entrypoints/pooling/embed/test_offline.py index 12b47b1a08a8b..ea46b7401f6d7 100644 --- a/tests/entrypoints/pooling/embed/test_offline.py +++ b/tests/entrypoints/pooling/embed/test_offline.py @@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "intfloat/multilingual-e5-small" prompts = ["The chef prepared a delicious meal."] @@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."] @pytest.fixture(scope="module") def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. 
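# Illustrative sketch: effect of the SDP switches set in the new embed
# conftest above. With flash and mem-efficient disabled, GPU calls to
# scaled_dot_product_attention are no longer eligible for the kernels that
# show the ROCm accuracy issue tracked in vllm-project/vllm#30167.
import torch
import torch.nn.functional as F

torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)

q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
out = F.scaled_dot_product_attention(q, k, v)  # falls back to the math path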
+ attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM( @@ -32,6 +33,7 @@ def llm(): gpu_memory_utilization=0.75, enforce_eager=True, seed=0, + attention_config=attention_config, ) yield weakref.proxy(llm) diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index f96338c47f0be..f5e563daeaa03 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -28,16 +28,20 @@ from vllm.utils.serial_utils import ( decode_pooling_output, ) -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" +if current_platform.is_rocm(): + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + + @pytest.fixture(scope="module") def server(): args = [ @@ -53,6 +57,10 @@ def server(): DUMMY_CHAT_TEMPLATE, ] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_online_dimensions.py b/tests/entrypoints/pooling/embed/test_online_dimensions.py index 26aa57742b02a..0545b8a0ae2fc 100644 --- a/tests/entrypoints/pooling/embed/test_online_dimensions.py +++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py @@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), EmbedModelInfo( @@ -62,6 +57,10 @@ def server(model_info, dtype: str): ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}'] ) + # ROCm: Use Flex Attention to support encoder-only self-attention. 
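# Illustrative sketch (not part of the patch): what the matryoshka override
# above enables. Inside one of the async tests below, a client can request
# truncated embeddings through the standard OpenAI `dimensions` field:
async def _check_matryoshka(client, model_name: str) -> None:
    res = await client.embeddings.create(
        model=model_name, input=["ping"], dimensions=256
    )
    assert len(res.data[0].embedding) == 256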
+ if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(model_info.name, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/embed/test_online_long_text.py b/tests/entrypoints/pooling/embed/test_online_long_text.py index 0be7eebc2017d..316a8526404c0 100644 --- a/tests/entrypoints/pooling/embed/test_online_long_text.py +++ b/tests/entrypoints/pooling/embed/test_online_long_text.py @@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - def _generate_random_text(word_count: int) -> str: """Generate random text with approximately the specified word count.""" @@ -228,6 +223,10 @@ def server_with_chunked_processing(): "0.8", ] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/score/test_correctness_mteb.py b/tests/entrypoints/pooling/score/test_correctness_mteb.py index 81ad0097187b0..71e75b93504ac 100644 --- a/tests/entrypoints/pooling/score/test_correctness_mteb.py +++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py @@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" @@ -30,6 +25,10 @@ st_main_score = 0.33457 def server(): args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/score/test_offline.py b/tests/entrypoints/pooling/score/test_offline.py index ce36d61cb8476..c02c02cf234a6 100644 --- a/tests/entrypoints/pooling/score/test_offline.py +++ b/tests/entrypoints/pooling/score/test_offline.py @@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" @pytest.fixture(scope="module") def llm(): + # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend + # that supports encoder-only models on ROCm. 
+ attention_config = None + if current_platform.is_rocm(): + attention_config = {"backend": "FLEX_ATTENTION"} + # pytest caches the fixture so we use weakref.proxy to # enable garbage collection llm = LLM( @@ -30,6 +31,7 @@ def llm(): gpu_memory_utilization=0.75, enforce_eager=True, seed=0, + attention_config=attention_config, ) yield weakref.proxy(llm) diff --git a/tests/entrypoints/pooling/score/test_online_rerank.py b/tests/entrypoints/pooling/score/test_online_rerank.py index f262dd4cb06b6..7f2af611d2e43 100644 --- a/tests/entrypoints/pooling/score/test_online_rerank.py +++ b/tests/entrypoints/pooling/score/test_online_rerank.py @@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse from vllm.entrypoints.pooling.score.protocol import RerankResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" @@ -24,6 +19,10 @@ DTYPE = "bfloat16" def server(): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server diff --git a/tests/entrypoints/pooling/score/test_online_score.py b/tests/entrypoints/pooling/score/test_online_score.py index 30ef55c8b6756..6c08027ee50b7 100644 --- a/tests/entrypoints/pooling/score/test_online_score.py +++ b/tests/entrypoints/pooling/score/test_online_score.py @@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.score.protocol import ScoreResponse from vllm.platforms import current_platform -if current_platform.is_rocm(): - pytest.skip( - "Encoder self-attention is not implemented on ROCm.", allow_module_level=True - ) - MODELS = [ {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False}, @@ -44,6 +39,10 @@ def model(request): def server(model: dict[str, Any]): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] + # ROCm: Use Flex Attention to support encoder-only self-attention. + if current_platform.is_rocm(): + args.extend(["--attention-backend", "FLEX_ATTENTION"]) + with RemoteOpenAIServer(model["name"], args) as remote_server: yield remote_server diff --git a/tests/standalone_tests/pytorch_nightly_dependency.sh b/tests/standalone_tests/pytorch_nightly_dependency.sh index fd93ad76bed0f..92820b269f9df 100644 --- a/tests/standalone_tests/pytorch_nightly_dependency.sh +++ b/tests/standalone_tests/pytorch_nightly_dependency.sh @@ -4,6 +4,11 @@ set -e set -x +if command -v rocminfo >/dev/null 2>&1; then + echo "Skipping test for ROCm platform" + exit 0 +fi + cd /vllm-workspace/ rm -rf .venv @@ -36,7 +41,7 @@ if diff before.txt after.txt; then echo "torch version not overridden." 
else echo "torch version overridden by nightly_torch_test.txt, \ - if the dependency is not triggered by the pytroch nightly test,\ + if the dependency is not triggered by the pytorch nightly test,\ please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py" exit 1 fi diff --git a/vllm/entrypoints/pooling/embed/conftest.py b/vllm/entrypoints/pooling/embed/conftest.py new file mode 100644 index 0000000000000..002b85874049c --- /dev/null +++ b/vllm/entrypoints/pooling/embed/conftest.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM pooling embed tests.""" + +import warnings + +import torch + +from vllm.platforms import current_platform + + +def pytest_collection_modifyitems(config, items): + """Configure ROCm-specific settings based on collected tests.""" + if not current_platform.is_rocm(): + return + + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + warnings.warn( + "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp " + "to avoid HuggingFace Transformers accuracy issues", + UserWarning, + stacklevel=1, + )
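# Illustrative sketch (not part of the patch): how the librocm-core.so probe
# guarded earlier in setup.py's get_rocm_version() reads the version. The
# getROCmVersion symbol and its zero-on-success return value come from
# rocm-core's rocm_version.cpp; treat the exact signature as an assumption.
import ctypes
from pathlib import Path

def read_rocm_version(rocm_home: str) -> str | None:
    lib_path = Path(rocm_home) / "lib" / "librocm-core.so"
    if not lib_path.is_file():
        return None
    lib = ctypes.CDLL(str(lib_path))
    major, minor, patch = (ctypes.c_uint32() for _ in range(3))
    if lib.getROCmVersion(ctypes.byref(major), ctypes.byref(minor),
                          ctypes.byref(patch)) != 0:
        return None
    return f"{major.value}.{minor.value}.{patch.value}"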