diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 5710aa1930b79..160bf2307fbf5 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -563,6 +563,7 @@ class FlashAttentionImpl(AttentionImpl):
                 softmax_scale=self.scale,
                 causal=True,
                 alibi_slopes=self.alibi_slopes,
+                softcap=self.logits_soft_cap,
             ).squeeze(1)

         # Reshape the output tensor.
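
For context, the `softcap` argument tells the FlashAttention kernel to apply tanh soft-capping to the attention logits before the softmax. A minimal reference sketch of that transformation is below; the function and variable names (`soft_capped_scores`, `scores`) are illustrative only, not the kernel's actual internals, and the convention that a cap of 0 means "disabled" is an assumption carried over from how the parameter is typically used.

```python
import torch


def soft_capped_scores(scores: torch.Tensor, softcap: float) -> torch.Tensor:
    """Illustrative reference for tanh logit soft-capping.

    `scores` stands in for the scaled Q @ K^T attention logits; the kernel
    applies the equivalent of this element-wise before the softmax.
    """
    # Assumed convention: a cap of 0 (or None) disables soft-capping.
    if not softcap:
        return scores
    # Bound the logits smoothly to the range (-softcap, +softcap).
    return softcap * torch.tanh(scores / softcap)
```
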