From 9f1710f1ace3535920c0bb6d4cc329c36289080e Mon Sep 17 00:00:00 2001 From: Ying Zhong Date: Fri, 7 Mar 2025 01:35:49 +0800 Subject: [PATCH] Fix mla prefill context performance (#13897) Signed-off-by: ZhongYingMatrix --- vllm/attention/backends/mla/common.py | 2 +- vllm/v1/attention/backends/mla/common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 8184b073275c6..109e8496fc31e 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1308,7 +1308,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): ) kv_c_normed = workspace[:toks]\ - [..., :self.kv_lora_rank].unsqueeze(1) + [..., :self.kv_lora_rank] k_pe = workspace[:toks]\ [..., self.kv_lora_rank:].unsqueeze(1) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index c98262eea1e91..0b55854de94af 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -874,7 +874,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ) kv_c_normed = workspace[:toks]\ - [..., :self.kv_lora_rank].unsqueeze(1) + [..., :self.kv_lora_rank] k_pe = workspace[:toks]\ [..., self.kv_lora_rank:].unsqueeze(1)