diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 8041448ed09a..80381270f419 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -66,7 +66,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                          PromptReplacement, PromptUpdate,
                                          PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
-from vllm.platforms import _Backend, current_platform
+from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import uses_mrope
 from vllm.utils import is_list_of
@@ -335,14 +335,6 @@ class Qwen3_VisionTransformer(nn.Module):
         }:
             raise RuntimeError(
                 f"Qwen3-VL does not support {self.attn_backend} backend now.")
-        if current_platform.is_device_capability(
-                100) and self.attn_backend != _Backend.TORCH_SDPA:
-            # TODO(Roger/Wentao): remove this after FA
-            # or XFORMERS's issue fixed on Blackwell
-            logger.info_once("Qwen3-VL vision attention does not support "
-                             f"{self.attn_backend} backend on Blackwell now. "
-                             "Vision attention backend is set to TORCH_SDPA.")
-            self.attn_backend = _Backend.TORCH_SDPA
 
         self.blocks = nn.ModuleList([
             Qwen3_VisionBlock(
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 58ba08101bc9..8b9f9f569206 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -205,6 +205,12 @@ class CudaPlatformBase(Platform):
     @classmethod
     def get_vit_attn_backend(cls, head_size: int,
                              dtype: torch.dtype) -> _Backend:
+
+        # For Blackwell GPUs, force TORCH_SDPA for now.
+        # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501
+        if cls.has_device_capability(100):
+            return _Backend.TORCH_SDPA
+
         if dtype not in (torch.float16, torch.bfloat16):
             return _Backend.XFORMERS