From dc625ea6b8f8072afd3e0923f7ef5a39be87fc96 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 9 Sep 2025 20:01:06 -0700 Subject: [PATCH] [Perf] Convert np array to torch tensor to index into block table for attn chunking (#24474) Signed-off-by: Yong Hoon Shin --- vllm/v1/attention/backends/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index b286a4ba9fe5..8e3d530fc1f9 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -542,7 +542,14 @@ def make_local_attention_virtual_batches( 1) batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32), local_blocks * pages_per_local_batch) - block_table_local = block_table[batch_indices, block_indices]\ + + # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance + # regression when using numpy arrays (batch and block indices) to index into + # torch tensor (block_table). As a workaround, convert numpy arrays to torch + # tensor first, which recovers perf. + batch_indices_torch = torch.from_numpy(batch_indices) + block_indices_torch = torch.from_numpy(block_indices) + block_table_local = block_table[batch_indices_torch, block_indices_torch]\ .view(virtual_batches, -1) query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local)