mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 11:06:15 +08:00
[UX] Add Feedback During CUDAGraph Capture (#19501)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
This commit is contained in:
parent
c7ea0b56cd
commit
97a9465bbc
@@ -12,6 +12,7 @@ import numpy as np
|
||||
import torch
|
||||
import torch.distributed
|
||||
import torch.nn as nn
|
||||
+ from tqdm import tqdm
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.attention import AttentionType, get_attn_backend
|
||||
@@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
# can reuse the memory pool allocated for the large shapes.
|
||||
with graph_capture(device=self.device):
|
||||
skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
|
||||
- for num_tokens in reversed(self.cudagraph_batch_sizes):
|
||||
+ for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
|
||||
+ desc="Capturing CUDA graphs",
|
||||
+ total=len(self.cudagraph_batch_sizes)):
|
||||
for _ in range(self.vllm_config.compilation_config.
|
||||
cudagraph_num_of_warmups):
|
||||
self._dummy_run(num_tokens, skip_attn=skip_attn)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user