mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 07:14:59 +08:00
[Misc] Fix seq_lens for graph capture (#23175)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
2c3f557f08
commit
40f26734b9
@@ -2317,15 +2317,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
|
|
||||||
# If force_attention is True, we always capture attention. Otherwise,
|
# If force_attention is True, we always capture attention. Otherwise,
|
||||||
# it only happens for cudagraph_runtime_mode=FULL.
|
# it only happens for cudagraph_runtime_mode=FULL.
|
||||||
if force_attention or cudagraph_runtime_mode == \
|
if force_attention or cudagraph_runtime_mode == CUDAGraphMode.FULL:
|
||||||
CUDAGraphMode.FULL:
|
|
||||||
attn_metadata = {}
|
attn_metadata = {}
|
||||||
|
|
||||||
# Make sure max_model_len is used at the graph capture time.
|
# Make sure max_model_len is used at the graph capture time.
|
||||||
self.seq_lens_np[:num_reqs] = self.max_model_len
|
self.seq_lens_np[:num_reqs] = self.max_model_len
|
||||||
self.seq_lens_np[num_reqs:] = 0
|
self.seq_lens_np[num_reqs:] = 0
|
||||||
self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
|
self.seq_lens.copy_(self.seq_lens_cpu, non_blocking=True)
|
||||||
non_blocking=True)
|
|
||||||
|
|
||||||
for kv_cache_group_id, kv_cache_group_spec in enumerate(
|
for kv_cache_group_id, kv_cache_group_spec in enumerate(
|
||||||
self.kv_cache_config.kv_cache_groups):
|
self.kv_cache_config.kv_cache_groups):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user