[Attention] Update tests to remove deprecated env vars (#30563)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Matthew Bonanni 2025-12-17 12:49:59 -05:00 committed by GitHub
parent 9ca8cb38fd
commit 7eb6cb6c18
34 changed files with 580 additions and 447 deletions
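The change is mechanical throughout: tests stop setting the deprecated VLLM_ATTENTION_BACKEND (and related VLLM_FLASH_ATTN_*) environment variables and instead select the backend through the attention config, either via the attention_config argument in Python or the --attention-backend / --attention-config.backend flags on the command line. A minimal sketch of the pattern (the model name is taken from the examples below, not from any one changed file):

# Old pattern (deprecated): select the backend via an environment variable
#   os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
#   llm = LLM(model="facebook/opt-125m")

# New pattern: pass the backend explicitly through the attention config
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    attention_config={"backend": "FLASH_ATTN"},
)

# Equivalent CLI forms used by the updated scripts and server tests:
#   --attention-backend=FLASH_ATTN
#   --attention-config.backend FLASH_ATTN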

View File

@ -39,7 +39,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine

View File

@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs(
@pytest.mark.parametrize("model_executor", ["uni", "mp"])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
def test_models(
monkeypatch: pytest.MonkeyPatch,
hf_runner,
model: str,
backend: str,
@ -77,48 +76,46 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = (
"The following numbers of the sequence "
+ ", ".join(str(i) for i in range(1024))
+ " are:"
)
example_prompts = [prompt]
# 5042 tokens for gemma2
# gemma2 has alternating sliding window size of 4096
# we need a prompt with more than 4096 tokens to test the sliding window
prompt = (
"The following numbers of the sequence "
+ ", ".join(str(i) for i in range(1024))
+ " are:"
)
example_prompts = [prompt]
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
with VllmRunner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
attention_config={"backend": backend},
) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
async_scheduling=async_scheduling,
distributed_executor_backend=model_executor,
) as vllm_model:
if enable_prompt_embeds:
vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
vllm_outputs = _fix_prompt_embed_outputs(
vllm_outputs, hf_model, example_prompts
)
else:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@multi_gpu_test(num_gpus=2)
@ -161,12 +158,6 @@ def test_models_distributed(
): # noqa
pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
if attention_backend:
monkeypatch_context.setenv(
"VLLM_ATTENTION_BACKEND",
attention_backend,
)
for k, v in extra_env.items():
monkeypatch_context.setenv(k, v)
@ -178,6 +169,7 @@ def test_models_distributed(
# if we run HF first, the CUDA initialization will be done and it
# will hurt the multiprocessing backend with the fork method
# (the default method).
attention_config = {"backend": attention_backend} if attention_backend else None
with vllm_runner(
model,
dtype=dtype,
@ -185,6 +177,7 @@ def test_models_distributed(
distributed_executor_backend=distributed_executor_backend,
enable_prompt_embeds=enable_prompt_embeds,
gpu_memory_utilization=0.7,
attention_config=attention_config,
) as vllm_model:
if enable_prompt_embeds:
with hf_runner(model, dtype=dtype) as hf_model:

View File

@ -208,7 +208,8 @@ def test_attn_quant(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties
@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties
@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
# To capture subprocess logs, we need to know whether spawn or fork is used.
# Force spawn as it is more general.
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs["attention_config"] = {"backend": backend.name}
compilation_config = CompilationConfig(
# Testing properties

View File

@ -89,7 +89,6 @@ class TestSetting:
],
)
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
):
# this test is run under multiple suites, with different GPUs.
@ -107,49 +106,48 @@ def test_compile_correctness(
f"{cuda_device_count_stateless()}"
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
final_args = [
*model_args,
"-pp",
str(pp_size),
"-tp",
str(tp_size),
"-cc.cudagraph_mode=none",
]
final_args = [
*model_args,
"-pp",
str(pp_size),
"-tp",
str(tp_size),
"-cc.cudagraph_mode=none",
f"--attention-backend={attn_backend}",
]
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
all_args: list[list[str]] = []
all_envs: list[dict[str, str] | None] = []
for comp_mode in [
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
for comp_mode in [
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
for mode in [CompilationMode.NONE, comp_mode]:
all_args.append(
final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
)
all_envs.clear()
all_args.clear()
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})
# inductor will change the output, so we only compare if the output
# is close, not exactly the same.
compare_all_settings(
model,
all_args,
all_envs,
method=method if method != "generate" else "generate_close",
)
all_envs.clear()
all_args.clear()
compare_all_settings(model, all_args * 3, all_envs, method=method)
for mode in [
CompilationMode.NONE,
CompilationMode.STOCK_TORCH_COMPILE,
CompilationMode.DYNAMO_TRACE_ONCE,
CompilationMode.VLLM_COMPILE,
]:
all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
all_envs.append({})
all_envs.append({})
compare_all_settings(model, all_args * 3, all_envs, method=method)

View File

@ -74,7 +74,6 @@ def llm_pair(request):
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0",
**backend_config.env_vars,
}
with temporary_environ(env_vars):
full = LLM(
@ -170,16 +169,10 @@ class TestFullCUDAGraph:
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
def test_full_cudagraph_with_invalid_backend():
with (
temporary_environ(
{
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}
),
pytest.raises(RuntimeError),
):
# Flex_Attention is not supported with full cuda graph
with pytest.raises(RuntimeError):
LLM(
model="Qwen/Qwen2-1.5B-Instruct",
compilation_config=CompilationConfig(cudagraph_mode="FULL"),
attention_config={"backend": "FLEX_ATTENTION"},
)

View File

@ -197,20 +197,19 @@ def test_custom_compile_config(
],
)
def test_fp8_kv_scale_compile(
monkeypatch: pytest.MonkeyPatch,
compilation_mode: int,
model: str,
backend: AttentionBackendEnum | None,
):
if backend:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3",
"calculate_kv_scales": True,
"max_model_len": 512,
}
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}
run_model(compilation_mode, model, **model_kwargs)

View File

@ -219,14 +219,12 @@ def _test_cp_gsm8k(
]
)
server_env = {}
if attn_backend:
server_env["VLLM_ATTENTION_BACKEND"] = attn_backend
server_args.append(f"--attention-backend={attn_backend}")
with RemoteOpenAIServer(
model_id,
server_args,
env_dict=server_env,
max_wait_seconds=720,
) as remote_server:
host = f"http://{remote_server.host}"

View File

@ -20,23 +20,21 @@ from ..utils import compare_two_settings, create_new_process_for_each_test
)
@create_new_process_for_each_test()
def test_pp_cudagraph(
monkeypatch: pytest.MonkeyPatch,
PP_SIZE: int,
MODEL_NAME: str,
ATTN_BACKEND: LiteralString,
):
with monkeypatch.context() as m:
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"mp",
]
m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"mp",
f"--attention-backend={ATTN_BACKEND}",
]
eager_args = cudagraph_args + ["--enforce-eager"]
eager_args = cudagraph_args + ["--enforce-eager"]
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)

View File

@ -9,7 +9,7 @@ from typing import Annotated, Literal
import pytest
from vllm.config import CompilationConfig, config
from vllm.config import AttentionConfig, CompilationConfig, config
from vllm.engine.arg_utils import (
EngineArgs,
contains_type,
@ -298,6 +298,139 @@ def test_compilation_config():
)
def test_attention_config():
from vllm.attention.backends.registry import AttentionBackendEnum
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
# default value
args = parser.parse_args([])
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config == AttentionConfig()
# set backend via dot notation
args = parser.parse_args(["--attention-config.backend", "FLASH_ATTN"])
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config.backend is not None
assert engine_args.attention_config.backend.name == "FLASH_ATTN"
# set backend via --attention-backend shorthand
args = parser.parse_args(["--attention-backend", "FLASHINFER"])
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_backend is not None
assert engine_args.attention_backend == "FLASHINFER"
# set all fields via dot notation
args = parser.parse_args(
[
"--attention-config.backend",
"FLASH_ATTN",
"--attention-config.flash_attn_version",
"3",
"--attention-config.use_prefill_decode_attention",
"true",
"--attention-config.flash_attn_max_num_splits_for_cuda_graph",
"16",
"--attention-config.use_cudnn_prefill",
"true",
"--attention-config.use_trtllm_ragged_deepseek_prefill",
"true",
"--attention-config.use_trtllm_attention",
"true",
"--attention-config.disable_flashinfer_prefill",
"true",
"--attention-config.disable_flashinfer_q_quantization",
"true",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config.backend is not None
assert engine_args.attention_config.backend.name == "FLASH_ATTN"
assert engine_args.attention_config.flash_attn_version == 3
assert engine_args.attention_config.use_prefill_decode_attention is True
assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 16
assert engine_args.attention_config.use_cudnn_prefill is True
assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is True
assert engine_args.attention_config.use_trtllm_attention is True
assert engine_args.attention_config.disable_flashinfer_prefill is True
assert engine_args.attention_config.disable_flashinfer_q_quantization is True
# set to string form of a dict with all fields
args = parser.parse_args(
[
"--attention-config="
'{"backend": "FLASHINFER", "flash_attn_version": 2, '
'"use_prefill_decode_attention": false, '
'"flash_attn_max_num_splits_for_cuda_graph": 8, '
'"use_cudnn_prefill": false, '
'"use_trtllm_ragged_deepseek_prefill": false, '
'"use_trtllm_attention": false, '
'"disable_flashinfer_prefill": false, '
'"disable_flashinfer_q_quantization": false}',
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
assert engine_args.attention_config.backend is not None
assert engine_args.attention_config.backend.name == "FLASHINFER"
assert engine_args.attention_config.flash_attn_version == 2
assert engine_args.attention_config.use_prefill_decode_attention is False
assert engine_args.attention_config.flash_attn_max_num_splits_for_cuda_graph == 8
assert engine_args.attention_config.use_cudnn_prefill is False
assert engine_args.attention_config.use_trtllm_ragged_deepseek_prefill is False
assert engine_args.attention_config.use_trtllm_attention is False
assert engine_args.attention_config.disable_flashinfer_prefill is False
assert engine_args.attention_config.disable_flashinfer_q_quantization is False
# test --attention-backend flows into VllmConfig.attention_config
args = parser.parse_args(
[
"--model",
"facebook/opt-125m",
"--attention-backend",
"FLASH_ATTN",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
vllm_config = engine_args.create_engine_config()
assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASH_ATTN
# test --attention-config.backend flows into VllmConfig.attention_config
args = parser.parse_args(
[
"--model",
"facebook/opt-125m",
"--attention-config.backend",
"FLASHINFER",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
vllm_config = engine_args.create_engine_config()
assert vllm_config.attention_config.backend == AttentionBackendEnum.FLASHINFER
# test --attention-backend and --attention-config.backend are mutually exclusive
args = parser.parse_args(
[
"--model",
"facebook/opt-125m",
"--attention-backend",
"FLASH_ATTN",
"--attention-config.backend",
"FLASHINFER",
]
)
assert args is not None
engine_args = EngineArgs.from_cli_args(args)
with pytest.raises(ValueError, match="mutually exclusive"):
engine_args.create_engine_config()
def test_prefix_cache_default():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
args = parser.parse_args([])

View File

@ -76,15 +76,10 @@ def default_server_args(with_tool_parser: bool):
@pytest.fixture(scope="module")
def gptoss_server(
monkeypatch_module: pytest.MonkeyPatch, default_server_args: list[str]
):
with monkeypatch_module.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
with RemoteOpenAIServer(
GPT_OSS_MODEL_NAME, default_server_args
) as remote_server:
yield remote_server
def gptoss_server(default_server_args: list[str]):
server_args = default_server_args + ["--attention-backend=TRITON_ATTN"]
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME, server_args) as remote_server:
yield remote_server
@pytest_asyncio.fixture

View File

@ -6,7 +6,9 @@ from unittest.mock import patch
import pytest
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
@ -73,18 +75,18 @@ def generate_params():
@pytest.mark.parametrize("device, name, use_mla, block_size", generate_params())
def test_env(
def test_backend_selection(
device: str,
name: str,
use_mla: bool,
block_size: int,
monkeypatch: pytest.MonkeyPatch,
):
"""Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
# Create AttentionConfig with the specified backend
attention_config = AttentionConfig(backend=AttentionBackendEnum[name])
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size)
@ -217,27 +219,32 @@ def test_env(
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_fp32_fallback(device: str):
"""Test attention backend selection with fp32."""
if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "CPU_ATTN"
# Use default config (no backend specified)
vllm_config = VllmConfig()
elif device == "cuda":
with patch("vllm.platforms.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
with set_current_vllm_config(vllm_config):
if device == "cpu":
with patch("vllm.platforms.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "CPU_ATTN"
elif device == "cuda":
with patch("vllm.platforms.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
pytest.skip(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var."
"handle fallbacks when a backend is explicitly set."
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLASH_ATTN")
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASH_ATTN)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
# Unsupported CUDA arch
monkeypatch.setattr(torch.cuda, "get_device_capability", lambda _=None: (7, 5))
backend = get_attn_backend(16, torch.float16, None, 16)
@ -277,15 +284,10 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
assert backend.get_name() != "FLASH_ATTN"
def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
def test_invalid_backend():
"""Test that invalid attention backend names raise ValueError."""
with (
monkeypatch.context() as m,
patch("vllm.platforms.current_platform", CudaPlatform()),
pytest.raises(ValueError),
):
m.setenv("VLLM_ATTENTION_BACKEND", "INVALID")
# Should raise ValueError for invalid backend
with pytest.raises(ValueError) as exc_info:
get_attn_backend(32, torch.float16, None, 16)
assert "Invalid value 'INVALID'" in str(exc_info.value)
# Invalid backend name should raise ValueError when creating enum
AttentionConfig(backend=AttentionBackendEnum["INVALID"])

View File

@ -4,7 +4,9 @@
import pytest
import torch
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms.rocm import RocmPlatform
@ -16,40 +18,56 @@ def clear_cache():
@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
def test_selector(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_ATTN")
# Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
# Set the current platform to ROCm using monkeypatch
monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
# Test standard ROCm attention
attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN)
vllm_config = VllmConfig(attention_config=attention_config)
# Test standard ROCm attention
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert backend.get_name() == "ROCM_FLASH" or backend.get_name() == "TRITON_ATTN"
# MLA test for deepseek related
# MLA test for deepseek related
# Change the attention backend to triton MLA
attention_config = AttentionConfig(backend=AttentionBackendEnum.TRITON_MLA)
vllm_config = VllmConfig(attention_config=attention_config)
# change the attention backend to triton MLA
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_MLA")
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
assert backend.get_name() == "TRITON_MLA"
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
m.setenv("VLLM_ATTENTION_BACKEND", "")
# If attention backend is None
# If use_mla is true
# The selected backend is triton MLA
attention_config = AttentionConfig(backend=None)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, use_mla=True)
assert backend.get_name() == "TRITON_MLA"
# change the attention backend to AITER MLA
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_MLA")
# Change the attention backend to AITER MLA
attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_AITER_MLA)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
assert backend.get_name() == "ROCM_AITER_MLA"
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
m.setenv("VLLM_ATTENTION_BACKEND", "")
# If attention backend is None
# If use_mla is true
# If VLLM_ROCM_USE_AITER is enabled
# The selected backend is ROCM_AITER_MLA
with monkeypatch.context() as m:
m.setenv("VLLM_ROCM_USE_AITER", "1")
backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, use_mla=True)
assert backend.get_name() == "ROCM_AITER_MLA"
attention_config = AttentionConfig(backend=None)
vllm_config = VllmConfig(attention_config=attention_config)
with set_current_vllm_config(vllm_config):
backend = get_attn_backend(
576, torch.bfloat16, "auto", 1, False, use_mla=True
)
assert backend.get_name() == "ROCM_AITER_MLA"

View File

@ -37,7 +37,7 @@ def set_seed(seed):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
def test_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
@ -54,35 +54,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
) as llm_flex:
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
attention_config={"backend": "FLEX_ATTENTION"},
) as llm_flex:
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
# Run with default backend
with monkeypatch.context() as m:
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
gpu_memory_utilization=0.85,
) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
gpu_memory_utilization=0.85,
) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=output_flex,
@ -96,7 +93,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
def test_encoder_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
@ -110,30 +107,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_flex:
flex_outputs = llm_flex.embed(prompts)
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
attention_config={"backend": "FLEX_ATTENTION"},
) as llm_flex:
flex_outputs = llm_flex.embed(prompts)
# Run with default backend
with (
monkeypatch.context() as m,
vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default,
):
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default:
default_outputs = llm_default.embed(prompts)
check_embeddings_close(

View File

@ -35,10 +35,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
@pytest.fixture
def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm():
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
return {"backend": "TRITON_ATTN"}
return None
def run_test(
@ -53,6 +55,7 @@ def run_test(
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
attention_config: dict | None = None,
):
"""Inference result should be the same between hf and vllm.
@ -80,6 +83,7 @@ def run_test(
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
attention_config=attention_config,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
@ -131,6 +135,7 @@ def test_models(
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
granite_speech_attention_config,
dtype: str,
max_model_len: int,
max_tokens: int,
@ -157,4 +162,5 @@ def test_models(
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
attention_config=granite_speech_attention_config,
)

View File

@ -2,23 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import os
import warnings
import pytest
from vllm.platforms import current_platform
def pytest_collection_modifyitems(config, items):
"""Set FLEX_ATTENTION backend for SigLIP tests on ROCm."""
if not current_platform.is_rocm():
return
@pytest.fixture
def siglip_attention_config():
"""Return attention config for SigLIP tests on ROCm.
siglip_tests = [item for item in items if "test_siglip" in item.nodeid]
if siglip_tests:
os.environ["VLLM_ATTENTION_BACKEND"] = "FLEX_ATTENTION"
warnings.warn(
"ROCm: Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION for SigLIP tests",
UserWarning,
stacklevel=1,
)
On ROCm, SigLIP tests require FLEX_ATTENTION backend.
"""
if current_platform.is_rocm():
return {"backend": "FLEX_ATTENTION"}
return None

View File

@ -38,6 +38,7 @@ def _run_test(
*,
dtype: str,
tokenization_kwargs: dict[str, Any] | None = None,
attention_config: dict[str, Any] | None = None,
) -> None:
if tokenization_kwargs is None:
tokenization_kwargs = {}
@ -49,6 +50,7 @@ def _run_test(
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
attention_config=attention_config,
) as vllm_model:
vllm_outputs = vllm_model.embed(
input_texts, images=input_images, tokenization_kwargs=tokenization_kwargs
@ -90,6 +92,7 @@ def test_models_text(
hf_runner,
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
@ -108,6 +111,7 @@ def test_models_text(
"padding": "max_length",
"max_length": 64,
}, # siglip2 was trained with this padding setting.
attention_config=siglip_attention_config,
)
@ -117,6 +121,7 @@ def test_models_image(
hf_runner,
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
@ -133,6 +138,7 @@ def test_models_image(
input_images,
model,
dtype=dtype,
attention_config=siglip_attention_config,
)
@ -141,6 +147,7 @@ def test_models_image(
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
siglip_attention_config,
model: str,
dtype: str,
) -> None:
@ -154,6 +161,7 @@ def test_models_text_image_no_crash(
enforce_eager=True,
max_model_len=64,
gpu_memory_utilization=0.7,
attention_config=siglip_attention_config,
) as vllm_model:
with pytest.raises(ValueError, match="not both"):
vllm_model.embed(texts, images=images)

View File

@ -75,7 +75,6 @@ def test_models(
with monkeypatch.context() as m:
m.setenv("TOKENIZERS_PARALLELISM", "true")
m.setenv("VLLM_ATTENTION_BACKEND", backend)
MAX_MODEL_LEN = 1024
NUM_LOG_PROBS = 8
@ -86,6 +85,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype="auto",
attention_config={"backend": backend},
) as vllm_model:
baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS
@ -97,6 +97,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
attention_config={"backend": backend},
) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS

View File

@ -108,11 +108,12 @@ def can_initialize(
patch.object(V1EngineCore, "_initialize_kv_caches", _initialize_kv_caches_v1),
monkeypatch.context() as m,
):
if model_arch == "GptOssForCausalLM":
# FIXME: A hack to bypass the FA3 assertion because our CI's L4 GPU
# has cc==8.9, which does not support FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
# FIXME: A hack to bypass the FA3 assertion because our CI's L4 GPU
# has cc==8.9, which does not support FA3 yet. Remove this hack when
# L4 supports FA3.
attention_config = (
{"backend": "TRITON_ATTN"} if model_arch == "GptOssForCausalLM" else None
)
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
@ -143,6 +144,7 @@ def can_initialize(
else "vllm",
hf_overrides=hf_overrides_fn,
max_num_seqs=model_info.max_num_seqs,
attention_config=attention_config,
)

View File

@ -94,26 +94,20 @@ def mock_on_gfx9():
None,
AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
),
# Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
(
{"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
None,
AttentionBackendEnum.ROCM_ATTN.get_path(),
),
# Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
# Test Case 9: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
(
{"VLLM_ROCM_USE_AITER": "1"},
"TRITON_ATTN",
AttentionBackendEnum.TRITON_ATTN.get_path(),
),
# Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# Test Case 10: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
# (explicitly disabled)
(
{"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
None,
AttentionBackendEnum.TRITON_ATTN.get_path(),
),
# Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
# Test Case 11: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
(
{"VLLM_ROCM_USE_AITER": "1"},
"ROCM_ATTN",

View File

@ -249,8 +249,8 @@ def create_dummy_kv_cache(
@dataclass
class BackendConfig:
name: str
env_vars: dict
comp_config: dict # compilation config
attention_config: dict
comp_config: dict
specific_gpu_arch: tuple | None = None
@ -259,10 +259,10 @@ full_cg_backend_configs = {
# FA3 on Hopper
"FA3": BackendConfig(
name="FA3",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_FLASH_ATTN_VERSION": "3",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
attention_config={
"backend": "FLASH_ATTN",
"flash_attn_version": 3,
"flash_attn_max_num_splits_for_cuda_graph": 16,
},
comp_config={
"cudagraph_mode": "FULL",
@ -272,9 +272,7 @@ full_cg_backend_configs = {
# FlashMLA on Hopper
"FlashMLA": BackendConfig(
name="FlashMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASHMLA",
},
attention_config={"backend": "FLASHMLA"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
@ -283,9 +281,7 @@ full_cg_backend_configs = {
# Cutlass MLA on Blackwell
"CutlassMLA": BackendConfig(
name="CutlassMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
},
attention_config={"backend": "CUTLASS_MLA"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
@ -294,9 +290,7 @@ full_cg_backend_configs = {
# FlashInfer MLA on Blackwell
"FlashInferMLA": BackendConfig(
name="FlashInferMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASHINFER_MLA",
},
attention_config={"backend": "FLASHINFER_MLA"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
@ -305,9 +299,9 @@ full_cg_backend_configs = {
# FlashAttention MLA on Hopper
"FlashAttentionMLA": BackendConfig(
name="FlashAttentionMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
attention_config={
"backend": "FLASH_ATTN_MLA",
"flash_attn_max_num_splits_for_cuda_graph": 16,
},
comp_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
@ -317,10 +311,10 @@ full_cg_backend_configs = {
# FA2
"FA2": BackendConfig(
name="FA2",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
"VLLM_FLASH_ATTN_VERSION": "2",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "16",
attention_config={
"backend": "FLASH_ATTN",
"flash_attn_version": 2,
"flash_attn_max_num_splits_for_cuda_graph": 16,
},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
@ -329,7 +323,7 @@ full_cg_backend_configs = {
# Triton Attention
"TritonAttn": BackendConfig(
name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
attention_config={"backend": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
@ -337,14 +331,17 @@ full_cg_backend_configs = {
# FlashInfer
"FlashInfer": BackendConfig(
name="FlashInfer",
env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
attention_config={"backend": "FLASHINFER"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
),
"RocmAttn": BackendConfig(
name="RocmAttn",
env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
attention_config={
"backend": "ROCM_ATTN",
"use_prefill_decode_attention": True,
},
comp_config={
"cudagraph_mode": "FULL",
},
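For orientation, a rough sketch of how one of these BackendConfig entries is consumed after this change (the consuming tests appear in the next file; the model name here is chosen only for illustration):

from vllm import LLM
from vllm.config import CompilationConfig

# Pick one of the backend configurations defined above, e.g. the FA2 entry
backend_config = full_cg_backend_configs["FA2"]

llm = LLM(
    model="Qwen/Qwen2-1.5B-Instruct",
    attention_config=backend_config.attention_config,
    compilation_config=CompilationConfig(
        cudagraph_mode=backend_config.comp_config["cudagraph_mode"],
    ),
)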

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
import os
import weakref
from contextlib import ExitStack
@ -13,26 +11,6 @@ from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode
from vllm.platforms import current_platform
@contextlib.contextmanager
def temporary_environ(env_vars):
"""
Temporarily set environment variables and restore them afterward.
We have to do this vs monkeypatch because monkeypatch doesn't work
with "module" scoped fixtures.
"""
original_env = {k: os.environ.get(k) for k in env_vars}
try:
os.environ.update(env_vars)
yield
finally:
for k, v in original_env.items():
if v is None:
os.environ.pop(k, None)
else:
os.environ[k] = v
# test attention backend and cudagraph_mode combo
# (backend_name, cudagraph_mode, supported)
if current_platform.is_rocm():
@ -68,9 +46,9 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = backend_configs[backend_name].env_vars
backend_config = backend_configs[backend_name]
attention_config = backend_config.attention_config
with temporary_environ(env_vars), ExitStack() as stack:
with ExitStack() as stack:
if not supported:
stack.enter_context(pytest.raises(Exception))
@ -80,6 +58,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
trust_remote_code=True,
gpu_memory_utilization=0.45,
max_model_len=1024,
attention_config=attention_config,
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=cudagraph_mode
),
@ -122,9 +101,10 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(
backend_name, cudagraph_mode, compilation_mode, supported
):
env_vars = backend_configs[backend_name].env_vars
backend_config = backend_configs[backend_name]
attention_config = backend_config.attention_config
with temporary_environ(env_vars), ExitStack() as stack:
with ExitStack() as stack:
if not supported:
stack.enter_context(pytest.raises(Exception))
@ -134,6 +114,7 @@ def test_cudagraph_compilation_combo(
trust_remote_code=True,
gpu_memory_utilization=0.45,
max_model_len=1024,
attention_config=attention_config,
compilation_config=CompilationConfig(
mode=compilation_mode, cudagraph_mode=cudagraph_mode
),

View File

@ -28,7 +28,7 @@ IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90()
BACKENDS,
)
def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
backend, monkeypatch: pytest.MonkeyPatch
backend,
):
"""
Ensures that the same request (the 'needle' prompt) yields identical output
@ -54,7 +54,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
attention_config = {"backend": backend}
# Allow overrides from environment (useful for CI tuning)
# "facebook/opt-125m" is too small, doesn't reliably test determinism
model = resolve_model_name(backend)
@ -92,6 +92,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
attention_config=attention_config,
)
# Baseline generation for the needle prompt alone.
@ -106,6 +107,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
attention_config=attention_config,
)
mismatches = 0
@ -163,10 +165,8 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
BACKENDS,
)
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
backend, monkeypatch: pytest.MonkeyPatch
backend,
):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = resolve_model_name(backend)
@ -193,6 +193,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
dtype="bfloat16", # not everything is supported
gpu_memory_utilization=0.9,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
# Use more realistic prompts for better token generation
@ -381,12 +382,11 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
"backend",
BACKENDS,
)
def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
def test_simple_generation(backend):
"""
Simple test that runs the model with a basic prompt and prints the output.
Useful for quick smoke testing and debugging.
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
model = resolve_model_name(backend)
llm = LLM(
@ -398,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
dtype="bfloat16",
enable_prefix_caching=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
prompt = "the capital of france is"
@ -444,8 +445,6 @@ def test_logprobs_without_batch_invariance_should_fail(
The test will PASS if we detect differences (proving batch invariance matters).
The test will FAIL if everything matches (suggesting batch invariance isn't needed).
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
# CRITICAL: Disable batch invariance for this test
monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0")
monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False)
@ -465,6 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail(
max_model_len=8192,
dtype="bfloat16",
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
# build ragged prompts to change shapes significantly across BS=1 vs BS=N
@ -649,7 +649,7 @@ def test_logprobs_without_batch_invariance_should_fail(
@skip_unsupported
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
def test_decode_logprobs_match_prefill_logprobs(
backend, monkeypatch: pytest.MonkeyPatch
backend,
):
"""
Test that verifies decode logprobs match prefill logprobs.
@ -664,8 +664,6 @@ def test_decode_logprobs_match_prefill_logprobs(
This ensures that the logprobs from decode are consistent with what
we would get if we ran prefill on each prefix.
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
seed = int(os.getenv("VLLM_TEST_SEED", "12345"))
random.seed(seed)
model_name = resolve_model_name(backend)
@ -689,6 +687,7 @@ def test_decode_logprobs_match_prefill_logprobs(
max_model_len=8192,
dtype="bfloat16",
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config={"backend": backend},
)
# Use a few test prompts
@ -920,6 +919,7 @@ def LLM_with_max_seqs(
max_num_seqs: int,
gpu_memory_utilization: float,
max_model_len: int,
attention_config: dict | None = None,
) -> LLM:
"""
Helper to construct an LLM with a specific max_num_seqs (batch-size limit)
@ -934,6 +934,7 @@ def LLM_with_max_seqs(
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
enable_prefix_caching=False,
enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90,
attention_config=attention_config,
# Enable for MOE models
# enable_expert_parallel=True,
)

View File

@ -136,11 +136,9 @@ def _compare_bs1_vs_bsn_single_process(
@skip_unsupported
@pytest.mark.parametrize("backend", BACKENDS)
def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
backend: str, monkeypatch: pytest.MonkeyPatch
backend: str,
) -> None:
random.seed(int(os.getenv("VLLM_TEST_SEED", "12345")))
# Override backend for this test (and the RemoteOpenAIServer child process).
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend)
model_name = resolve_model_name(backend)
prompts_all = [_random_prompt(10, 50) for _ in range(32)]
@ -156,6 +154,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(
server_args: list[str] = [
"--max-model-len=8192",
"--max-num-seqs=32",
f"--attention-backend={backend}",
]
if tp_size:
server_args += ["-tp", tp_size]

View File

@ -142,16 +142,17 @@ def run_tests(
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor with spec decoding."""
with monkeypatch.context() as m:
# avoid precision errors
if current_platform.is_rocm():
if is_testing_with_spec_decoding:
# Use TRITON_ATTN for spec decoding test for consistency
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
# Determine attention config based on platform
if current_platform.is_rocm():
if is_testing_with_spec_decoding:
# Use TRITON_ATTN for spec decoding test for consistency
attention_config = {"backend": "TRITON_ATTN"}
else:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
attention_config = {"backend": "ROCM_AITER_FA"}
else:
attention_config = {"backend": "FLEX_ATTENTION"}
with monkeypatch.context() as m:
# lock matmul precision to full FP32 (IEEE)
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
@ -174,6 +175,7 @@ def run_tests(
spec_config,
test_prefill_chunking=test_prefill_chunking,
is_testing_with_spec_decoding=is_testing_with_spec_decoding,
attention_config=attention_config,
)
outputs.append(test_results)
@ -262,6 +264,7 @@ def run_test(
spec_config: dict[str, Any] | None,
test_prefill_chunking: bool,
is_testing_with_spec_decoding: bool = False,
attention_config: dict[str, Any] | None = None,
):
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
@ -301,6 +304,7 @@ def run_test(
dtype=dtype,
speculative_config=spec_config,
disable_log_stats=False,
attention_config=attention_config,
**cache_arg,
) as vllm_model:
results = []

View File

@ -10,7 +10,7 @@ from ...utils import create_new_process_for_each_test
@create_new_process_for_each_test()
@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
def test_cascade_attention(example_system_message, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
if attn_backend == "FLASHINFER":
@ -19,19 +19,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
"needs investigation. See issue #25679."
)
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(
model="Qwen/Qwen2-1.5B-Instruct", attention_config={"backend": attn_backend}
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
# No cascade attention.
single_prompt = [example_system_message + prompt]
responses = llm.generate(single_prompt, sampling_params)
ref_output = responses[0].outputs[0].text
# No cascade attention.
single_prompt = [example_system_message + prompt]
responses = llm.generate(single_prompt, sampling_params)
ref_output = responses[0].outputs[0].text
# (Probably) Use cascade attention.
prompts = [example_system_message + prompt] * 64
responses = llm.generate(prompts, sampling_params)
for response in responses:
assert response.outputs[0].text == ref_output
# (Probably) Use cascade attention.
prompts = [example_system_message + prompt] * 64
responses = llm.generate(prompts, sampling_params)
for response in responses:
assert response.outputs[0].text == ref_output

View File

@ -438,25 +438,26 @@ def test_eagle_correctness(
should be the same when using eagle speculative decoding.
model_setup: (method, model_name, eagle_model_name, tp_size)
"""
# Determine attention config
# Scout requires default backend selection because its vision encoder has
# head_dim 88, which is incompatible with FLASH_ATTN, so it must fall back
# to Flex Attn
if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
if current_platform.is_rocm():
# TODO: Enable Flex Attn for spec_decode on ROCm
pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
attention_config = None # Let it fall back to default
else:
attention_config = {"backend": attn_backend}
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
with monkeypatch.context() as m:
if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
# Scout requires default backend selection
# because vision encoder has head_dim 88 being incompatible
# with FLASH_ATTN and needs to fall back to Flex Attn
# pass if not ROCm
if current_platform.is_rocm():
# TODO: Enable Flex Attn for spec_decode on ROCm
pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
else:
m.setenv("VLLM_MLA_DISABLE", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
m.setenv("VLLM_MLA_DISABLE", "1")
if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
if "deepseek" in model_setup[1].lower():
@ -471,7 +472,10 @@ def test_eagle_correctness(
max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
ref_llm = LLM(
model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
model=model_name,
max_model_len=max_model_len,
tensor_parallel_size=tp_size,
attention_config=attention_config,
)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
@ -492,6 +496,7 @@ def test_eagle_correctness(
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
model_impl=model_impl,
attention_config=attention_config,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0

View File

@ -3,21 +3,29 @@ set -xe
# Parse command line arguments
KV_BUFFER_DEVICE="cuda" # Default to cuda
ATTENTION_BACKEND="" # Default to empty (use vllm default)
while [[ $# -gt 0 ]]; do
case $1 in
--kv_buffer_device)
KV_BUFFER_DEVICE="$2"
shift 2
;;
--attention-backend)
ATTENTION_BACKEND="$2"
shift 2
;;
*)
echo "Unknown option $1"
echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
echo "Usage: $0 [--kv_buffer_device <cuda|cpu>] [--attention-backend <backend>]"
exit 1
;;
esac
done
echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
if [[ -n "$ATTENTION_BACKEND" ]]; then
echo "Using attention backend: $ATTENTION_BACKEND"
fi
DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
@ -148,6 +156,11 @@ run_tests_for_model() {
--tensor-parallel-size $PREFILLER_TP_SIZE \
--kv-transfer-config '$KV_CONFIG'"
# Add attention backend config if specified
if [[ -n "$ATTENTION_BACKEND" ]]; then
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
fi
if [ -n "$model_args" ]; then
FULL_CMD="$BASE_CMD $model_args"
else
@ -188,7 +201,12 @@ run_tests_for_model() {
--block-size ${DECODE_BLOCK_SIZE} \
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
--kv-transfer-config '$KV_CONFIG'"
# Add attention backend config if specified
if [[ -n "$ATTENTION_BACKEND" ]]; then
BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
fi
# DP-EP attention mode
if [[ -z "$DP_EP" ]]; then
BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"

View File

@ -15,14 +15,14 @@ configs=(
run_tests() {
local label=$1
local extra_env=$2
local extra_args=$2
echo "=== Running tests (${label}) ==="
for cfg in "${configs[@]}"; do
echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}"
# Use 'env' to safely set variables without eval
if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then
echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
if ! env ${cfg} bash "${SCRIPT}" ${extra_args}; then
echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}"
exit 1
fi
done
@ -34,8 +34,8 @@ run_tests "default backend" ""
# Check if FLASHINFER is set (non-empty)
if [[ -n "${FLASHINFER:-}" ]]; then
echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER"
run_tests "FLASHINFER backend" "--attention-backend FLASHINFER"
else
echo "FLASHINFER not set, skipping FLASHINFER runs."
fi

View File

@ -1132,7 +1132,7 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
"TRITON_ATTN",
],
)
def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
def test_register_kv_caches(dist_init, attn_backend):
"""
Test that register_kv_caches() properly calls nixl_wrapper methods with
correct data.
@ -1144,9 +1144,7 @@ def test_register_kv_caches(dist_init, attn_backend, monkeypatch):
block layout info
"""
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
vllm_config = create_vllm_config()
vllm_config = create_vllm_config(attention_backend=attn_backend)
# Import the appropriate backend based on the parameter
if attn_backend == "FLASH_ATTN":

View File

@ -11,6 +11,7 @@ import torch
from vllm import SamplingParams
from vllm.config import (
AttentionConfig,
CacheConfig,
DeviceConfig,
KVTransferConfig,
@ -94,6 +95,7 @@ def create_vllm_config(
dtype: str = "float16",
cache_dtype: str = "auto",
hf_overrides: dict[str, Any] | None = None,
attention_backend: str | None = None,
) -> VllmConfig:
"""Initialize VllmConfig For Testing."""
model_config = ModelConfig(
@ -124,12 +126,14 @@ def create_vllm_config(
enable_permute_local_kv=enable_permute_local_kv,
kv_connector_extra_config=kv_connector_extra_config or {},
)
attention_config = AttentionConfig(backend=attention_backend)
return VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
device_config=DeviceConfig("cpu"),
attention_config=attention_config,
)

View File

@ -13,7 +13,6 @@ from vllm import LLM, SamplingParams, TokensPrompt
from vllm.config import KVEventsConfig, KVTransferConfig
from vllm.distributed.kv_events import BlockStored, KVEventBatch
from vllm.platforms import current_platform
from vllm.utils.system_utils import set_env_var
CPU_BLOCK_SIZES = [48]
ATTN_BACKENDS = ["FLASH_ATTN"]
@ -180,13 +179,13 @@ def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None:
topic="test",
)
with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend):
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
gpu_memory_utilization=0.5,
kv_events_config=kv_events_config,
kv_transfer_config=kv_transfer_config,
)
llm = LLM(
model="meta-llama/Llama-3.2-1B-Instruct",
gpu_memory_utilization=0.5,
kv_events_config=kv_events_config,
kv_transfer_config=kv_transfer_config,
attention_config={"backend": attn_backend},
)
events_endpoint = events_endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic)

View File

@ -15,6 +15,7 @@ from tests.v1.attention.utils import (
)
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import (
AttentionConfig,
CacheConfig,
DeviceConfig,
ModelConfig,
@ -38,6 +39,7 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
def _create_proposer(
method: str,
num_speculative_tokens: int,
attention_backend: str | None = None,
speculative_token_tree: list[tuple[int, ...]] | None = None,
) -> EagleProposer:
model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100)
@ -70,6 +72,7 @@ def _create_proposer(
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
),
attention_config=AttentionConfig(backend=attention_backend),
)
return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
@ -331,8 +334,6 @@ def test_load_model(
use_distinct_lm_head,
monkeypatch,
):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
@ -394,7 +395,9 @@ def test_load_model(
assert not isinstance(target_model, SupportsMultiModal)
# Create proposer using the helper function
proposer = _create_proposer(method, num_speculative_tokens=8)
proposer = _create_proposer(
method, num_speculative_tokens=8, attention_backend=attn_backend
)
# Call the method under test
proposer.load_model(target_model)
@ -420,8 +423,6 @@ def test_load_model(
@pytest.mark.parametrize("attn_backend", get_attn_backend_list_based_on_platform())
@pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
@ -449,7 +450,9 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
seq_lens = [seq_len_1, seq_len_2]
# Create proposer first so we can use its actual hidden_size
proposer = _create_proposer("eagle", num_speculative_tokens)
proposer = _create_proposer(
"eagle", num_speculative_tokens, attention_backend=attn_backend
)
# Get the hidden_size from the proposer to ensure consistency
hidden_size = proposer.hidden_size
@ -622,7 +625,9 @@ def test_propose_tree(spec_token_tree):
# Create proposer first so we can use its actual hidden_size.
proposer = _create_proposer(
"eagle", num_speculative_tokens, speculative_token_tree=spec_token_tree
"eagle",
num_speculative_tokens,
speculative_token_tree=spec_token_tree,
)
# Get the hidden_size from the proposer to ensure consistency.
hidden_size = proposer.hidden_size

View File

@ -38,53 +38,48 @@ def test_ngram_max_len(num_speculative_tokens: int):
def test_eagle_max_len(
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
):
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
enforce_eager=True, # For faster initialization.
speculative_config={
"method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": num_speculative_tokens,
"max_model_len": 80,
},
max_model_len=200,
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
pytest.skip(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
outputs = llm.generate(_PROMPTS, sampling_params)
for o in outputs:
assert o.outputs[0].finish_reason == "length", (
"This test is only meaningful if the output "
"is truncated due to max length"
)
sampling_params = SamplingParams(
max_tokens=200,
structured_outputs=StructuredOutputsParams(
regex="^" + "a b c d e " * 15 + "$"
),
if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct",
enforce_eager=True, # For faster initialization.
speculative_config={
"method": "eagle",
"model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": num_speculative_tokens,
"max_model_len": 80,
},
max_model_len=200,
attention_config={"backend": attn_backend},
)
sampling_params = SamplingParams(max_tokens=200, ignore_eos=True)
outputs = llm.generate(_PROMPTS, sampling_params)
for o in outputs:
assert o.outputs[0].finish_reason == "length", (
"This test is only meaningful if the output is truncated due to max length"
)
output = llm.generate(_PROMPTS, sampling_params)
for o in output:
assert o.prompt_token_ids is not None
assert (
len(o.prompt_token_ids)
< 80
< len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
<= 200
), (
"This test is only meaningful if the output "
"is longer than the eagle max length"
)
assert o.outputs[0].text == "a b c d e " * 15
sampling_params = SamplingParams(
max_tokens=200,
structured_outputs=StructuredOutputsParams(regex="^" + "a b c d e " * 15 + "$"),
)
output = llm.generate(_PROMPTS, sampling_params)
for o in output:
assert o.prompt_token_ids is not None
assert (
len(o.prompt_token_ids)
< 80
< len(o.prompt_token_ids) + len(o.outputs[0].token_ids)
<= 200
), (
"This test is only meaningful if the output "
"is longer than the eagle max length"
)
assert o.outputs[0].text == "a b c d e " * 15

View File

@ -165,7 +165,7 @@ class RocmAttentionBackend(AttentionBackend):
raise ValueError(
f"Head size {head_size} is not supported by {attn_type}. "
f"Supported head sizes are: {cls.get_supported_head_sizes()}. "
"Set --attention-config.backend=FLEX_ATTENTION to use "
"Set --attention-backend=FLEX_ATTENTION to use "
"FlexAttention backend which supports all head sizes."
)