[BugFix] Make FlashInferMetadataBuilder non-blocking (#25040)

Signed-off-by: Julien Lin <jullin@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
nvjullin 2025-09-20 04:36:34 +08:00 committed by GitHub
parent 48ecb4438b
commit b1a63d1b3b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
kv_data_type=self.kv_cache_dtype,
)
else:
-attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device)
+attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(
+self.device, non_blocking=True)
attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
-self.device)
+self.device, non_blocking=True)
if num_decodes > 0:
pure_decode = num_prefills == 0