diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index c68115729c425..41cb7084057dd 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -428,6 +428,14 @@ class Qwen2_5_VisionAttention(nn.Module):
             )
         elif self.attn_backend == _Backend.TORCH_SDPA:
             # Execute attention entry by entry for speed & less VRAM.
+            from vllm.platforms import current_platform
+
+            # Never remove the next contiguous logic
+            # Without it, hallucinations occur with the backend
+            if current_platform.is_rocm():
+                q = q.contiguous()
+                k = k.contiguous()
+                v = v.contiguous()
             outputs = []
             for i in range(1, len(cu_seqlens)):
                 start_idx = cu_seqlens[i - 1]
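
For context, here is a minimal, self-contained sketch of the pattern this hunk adds: forcing `.contiguous()` layouts on ROCm before the per-entry `torch.nn.functional.scaled_dot_product_attention` loop. The tensor shapes, the `is_rocm` flag (standing in for vLLM's `current_platform.is_rocm()`), and the helper name `sdpa_per_entry` are illustrative assumptions, not the actual vLLM code.

```python
# Illustrative sketch only; shapes and names are assumptions, not vLLM's API.
import torch
import torch.nn.functional as F

def sdpa_per_entry(q: torch.Tensor,
                   k: torch.Tensor,
                   v: torch.Tensor,
                   cu_seqlens: torch.Tensor,
                   is_rocm: bool) -> torch.Tensor:
    """q/k/v: (total_seq, num_heads, head_dim); cu_seqlens: cumulative lengths."""
    if is_rocm:
        # On ROCm, SDPA over non-contiguous inputs has produced wrong outputs
        # (observed as hallucinations), so force contiguous layouts up front.
        q, k, v = q.contiguous(), k.contiguous(), v.contiguous()
    outputs = []
    for i in range(1, len(cu_seqlens)):
        start, end = cu_seqlens[i - 1], cu_seqlens[i]
        # One sequence at a time: (seq_i, heads, dim) -> (heads, seq_i, dim),
        # trading a Python loop for lower peak VRAM.
        q_i, k_i, v_i = (t[start:end].transpose(0, 1) for t in (q, k, v))
        out_i = F.scaled_dot_product_attention(q_i, k_i, v_i)
        outputs.append(out_i.transpose(0, 1))  # back to (seq_i, heads, dim)
    return torch.cat(outputs)
```

Note the guard runs once, before the loop, so every slice taken inside the loop inherits the contiguous layout rather than paying for a copy per entry.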