mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-01-11 03:54:28 +08:00
Use the optimized block sizes after tuning the kernel. (#14329)
This commit is contained in:
parent
f7a6bd0fa1
commit
1e3598edeb
@@ -12,8 +12,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
|
||||
# These are the 2 tunable parameters of the paged attention Pallas kernel.
|
||||
NUM_QUERIES_PER_BLOCK = 32
|
||||
NUM_KV_PAGES_PER_BLOCK = 128
|
||||
NUM_QUERIES_PER_BLOCK = 16
|
||||
NUM_KV_PAGES_PER_BLOCK = 256
|
||||
|
||||
|
||||
class PallasAttentionBackend(AttentionBackend):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user