[Perf] Convert np array to torch tensor to index into block table for attn chunking (#24474)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-12-10 05:34:57 +08:00 · 2025-09-09 20:01:06 -07:00 · 2025-09-09 20:01:06 -07:00 · dc625ea6b8
commit dc625ea6b8
parent b23fb78623
1 changed files with 8 additions and 1 deletions
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@ -542,7 +542,14 @@ def make_local_attention_virtual_batches(
                                                   1)
    batch_indices = np.repeat(np.arange(actual_batch_size, dtype=np.int32),
                              local_blocks * pages_per_local_batch)
-    block_table_local = block_table[batch_indices, block_indices]\
+
+    # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance
+    # regression when using numpy arrays (batch and block indices) to index into
+    # torch tensor (block_table). As a workaround, convert numpy arrays to torch
+    # tensor first, which recovers perf.
+    batch_indices_torch = torch.from_numpy(batch_indices)
+    block_indices_torch = torch.from_numpy(block_indices)
+    block_table_local = block_table[batch_indices_torch, block_indices_torch]\
        .view(virtual_batches, -1)

    query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local)