From 7a0b011dd51e5c6b48e8f8f5424be0995b5cb8ee Mon Sep 17 00:00:00 2001
From: Jason Zhu
Date: Mon, 22 Jan 2024 14:47:25 -0800
Subject: [PATCH] Add a 1-line docstring to explain why calling
 context_attention_fwd twice in test_prefix_prefill.py (#2553)

---
 tests/kernels/test_prefix_prefill.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index 8fa6358d3ec7..0531b05135fb 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -125,6 +125,7 @@ def test_contexted_kv_attention(
     v_cache = v_cache.view(-1, block_size, num_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
 
+    # Warm up the Triton kernel by calling it once before actually measuring generation time
     context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                           b_start_loc, b_seq_len, b_ctx_len, max_input_len)
     torch.cuda.synchronize()
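
Context for the added comment: the test calls context_attention_fwd once, synchronizes, and then times a later call, because the first invocation of a Triton kernel pays the JIT compilation and auto-tuning cost and would skew the measurement. Below is a minimal sketch of that warm-up-then-measure pattern, not code from the patch or from vLLM; the names time_kernel and run_triton_kernel are hypothetical placeholders for context_attention_fwd and its arguments.

import time

import torch


def time_kernel(run_triton_kernel, *args):
    # First call: compiles/auto-tunes the Triton kernel, so it is excluded
    # from timing (this is the call the new comment in the test explains).
    run_triton_kernel(*args)
    torch.cuda.synchronize()

    # Second call: measure only the steady-state kernel latency.
    start = time.time()
    run_triton_kernel(*args)
    torch.cuda.synchronize()
    return time.time() - start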