Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-10 20:55:35 +08:00
Add a 1-line docstring to explain why context_attention_fwd is called twice in test_prefix_prefill.py (#2553)
parent 63e835cbcc
commit 7a0b011dd5
@@ -125,6 +125,7 @@ def test_contexted_kv_attention(
     v_cache = v_cache.view(-1, block_size, num_heads,
                            head_size).permute(0, 2, 3, 1).contiguous()
 
+    # Warm up the Triton kernel by calling it once before actually measuring generation time
     context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table,
                           b_start_loc, b_seq_len, b_ctx_len, max_input_len)
     torch.cuda.synchronize()
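For context, the warm-up-before-timing pattern that the added comment documents is a common way to keep Triton's one-time JIT compilation and autotuning cost out of a benchmark. The sketch below is illustrative only and is not taken from the vLLM test; benchmark_kernel and its kernel argument are hypothetical placeholders for any GPU kernel you want to time.

import time

import torch


def benchmark_kernel(kernel, *args, iters: int = 10) -> float:
    """Time a GPU kernel, excluding its first (JIT-compiling) invocation."""
    # Warm-up call: the first invocation of a Triton kernel triggers JIT
    # compilation and autotuning, which would otherwise dominate the timing.
    kernel(*args)
    torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(iters):
        kernel(*args)
    # Kernel launches are asynchronous; synchronize before stopping the clock.
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters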