diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 505ebec34d2f..a9a624b85abc 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -177,7 +177,7 @@ class ipex_ops:
                          out: torch.Tensor,
                          seqlen_q: torch.Tensor,
                          seqlen_k: torch.Tensor,
-                         alibi_slopes: torch.Tensor,
+                         alibi_slopes: Optional[torch.Tensor],
                          max_seqlen_q: int,
                          max_seqlen_k: int,
                          pdropout: float,
@@ -193,6 +193,8 @@ class ipex_ops:
         if ipex.__version__.endswith("cpu"):
             if logits_soft_cap != 0.0:
                 raise ValueError("IPEX CPU does not support logits_soft_cap")
+            assert alibi_slopes is None
+            assert window_size_left < 0 and window_size_right < 0
             ipex.llm.functional.varlen_attention(query.contiguous(),
                                                  key.contiguous(),
                                                  value.contiguous(), out,
diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py
index e2d16908fa9a..528df2e98679 100644
--- a/vllm/attention/backends/cpu_mla.py
+++ b/vllm/attention/backends/cpu_mla.py
@@ -273,6 +273,9 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]):
             return_softmax=False,
             gen_=None,
             logits_soft_cap=0.0,
+            window_size_left=-1,
+            window_size_right=-1,
+            alibi_slopes=None,
         )

         # remove padding