[Misc] Fix seq_lens for graph capture (#23175)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
commit 40f26734b9
parent 2c3f557f08
@@ -2317,15 +2317,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # If force_attention is True, we always capture attention. Otherwise,
         # it only happens for cudagraph_runtime_mode=FULL.
-        if force_attention or cudagraph_runtime_mode == \
-            CUDAGraphMode.FULL:
+        if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
             attn_metadata = {}
 
             # Make sure max_model_len is used at the graph capture time.
             self.seq_lens_np[:num_reqs] = self.max_model_len
             self.seq_lens_np[num_reqs:] = 0
-            self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
-                                           non_blocking=True)
+            self.seq_lens.copy_(self.seq_lens_cpu, non_blocking=True)
 
             for kv_cache_group_id, kv_cache_group_spec in enumerate(
                     self.kv_cache_config.kv_cache_groups):
 
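Note on the change (not part of the commit itself): the CPU-side buffer is filled with max_model_len for the first num_reqs slots and zeroed beyond them, but the old code copied only the first num_reqs entries to the device, so the padded tail of seq_lens on the GPU was never updated during graph capture. Copying the whole buffer updates the padded entries as well. Below is a minimal, hedged sketch of that difference; the sizes, the seq_lens_gpu stand-in tensor (a plain CPU tensor here, not an actual device copy), and the printed values are illustrative assumptions, not vLLM internals.

    import torch

    # Stand-in sizes; the real values come from the model runner's configuration.
    max_num_reqs = 8       # padded batch size used for graph capture (assumed)
    max_model_len = 4096   # maximum sequence length (assumed)
    num_reqs = 3           # number of "real" requests in this capture (assumed)

    # Pinned CPU buffer and its device counterpart, mirroring seq_lens_cpu / seq_lens.
    seq_lens_cpu = torch.zeros(max_num_reqs, dtype=torch.int32)
    seq_lens_gpu = torch.full((max_num_reqs,), -1, dtype=torch.int32)  # stale values

    # Make sure max_model_len is used at graph capture time, and zero the padding.
    seq_lens_cpu[:num_reqs] = max_model_len
    seq_lens_cpu[num_reqs:] = 0

    # Old behavior: only the first num_reqs entries reach the device, so the
    # padded tail keeps whatever stale values it held before.
    seq_lens_gpu[:num_reqs].copy_(seq_lens_cpu[:num_reqs])
    print(seq_lens_gpu.tolist())  # [4096, 4096, 4096, -1, -1, -1, -1, -1]

    # New behavior: copy the whole buffer so the padded entries are zeroed too.
    seq_lens_gpu.copy_(seq_lens_cpu)
    print(seq_lens_gpu.tolist())  # [4096, 4096, 4096, 0, 0, 0, 0, 0]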