mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 07:14:59 +08:00
[BugFix] Make FlashInferMetadataBuilder non-blocking (#25040)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
parent
48ecb4438b
commit
b1a63d1b3b
@@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
kv_data_type=self.kv_cache_dtype,
|
kv_data_type=self.kv_cache_dtype,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device)
|
attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(
|
||||||
|
self.device, non_blocking=True)
|
||||||
attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
|
attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to(
|
||||||
self.device)
|
self.device, non_blocking=True)
|
||||||
|
|
||||||
if num_decodes > 0:
|
if num_decodes > 0:
|
||||||
pure_decode = num_prefills == 0
|
pure_decode = num_prefills == 0
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user