From 64862d106efa78032702f5fa5c110ccd6d654e9a Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Tue, 4 Feb 2025 19:58:22 -0800
Subject: [PATCH] [ROCM][AMD][TRITON] Halving warps number for fw_prefill to reduce spilling (#12713)

Signed-off-by: Aleksandr Malyshev
Co-authored-by: Aleksandr Malyshev
---
 vllm/attention/ops/prefix_prefill.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index fbb6757ee3043..5fca1639363e0 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -11,7 +11,7 @@ from vllm.platforms import current_platform
 
 # Static kernels parameters
 BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64
-NUM_WARPS = 8
+NUM_WARPS = 4 if current_platform.is_rocm() else 8
 
 # To check compatibility
 IS_TURING = current_platform.get_device_capability() == (7, 5)