From 1e3598edeb694c1a6a1fdfac5e7d12697749561b Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Fri, 7 Mar 2025 05:25:13 -0800
Subject: [PATCH] Use the optimized block sizes after tuning the kernel.
 (#14329)

---
 vllm/v1/attention/backends/pallas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index bbbdf50ac0cc7..bf3992281a735 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -12,8 +12,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
 from vllm.attention.backends.utils import CommonAttentionState
 
 # These are the 2 tunable parameters of the paged attention Pallas kernel.
-NUM_QUERIES_PER_BLOCK = 32
-NUM_KV_PAGES_PER_BLOCK = 128
+NUM_QUERIES_PER_BLOCK = 16
+NUM_KV_PAGES_PER_BLOCK = 256
 
 
 class PallasAttentionBackend(AttentionBackend):
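
For context, a minimal sketch of what block-size tunables like these typically control in a paged-attention kernel: the launch grid. Everything below (the cdiv helper, the kernel_grid function, and the example shapes) is an illustrative assumption, not the actual vLLM/Pallas kernel code.

    # Illustrative sketch only: how block-size tunables typically shape the
    # launch grid of a paged-attention kernel. Names and shapes here are
    # assumptions for illustration, not the real vLLM call.

    NUM_QUERIES_PER_BLOCK = 16
    NUM_KV_PAGES_PER_BLOCK = 256


    def cdiv(a: int, b: int) -> int:
        """Ceiling division: rounds a partial block up to a full one."""
        return -(-a // b)


    def kernel_grid(num_tokens: int, num_kv_pages: int) -> tuple[int, int]:
        """Grid of (query-block, kv-page-block) program instances.

        Smaller query blocks launch more programs along the token axis;
        larger KV-page blocks let each program scan more of the KV cache
        per step. Tuning trades parallelism against per-program work.
        """
        return (
            cdiv(num_tokens, NUM_QUERIES_PER_BLOCK),
            cdiv(num_kv_pages, NUM_KV_PAGES_PER_BLOCK),
        )


    # Example: 1024 queued tokens over 2048 KV-cache pages.
    # Old tuning (32, 128) -> grid (32, 16); new tuning (16, 256) -> (64, 8).
    print(kernel_grid(1024, 2048))  # (64, 8)

Under this reading, the patch halves the query block size and doubles the KV-page block size, which the commit message attributes to kernel tuning rather than a functional change.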