diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b1bc727e1e8e..987a24496d75 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -12,6 +12,7 @@ import numpy as np import torch import torch.distributed import torch.nn as nn +from tqdm import tqdm import vllm.envs as envs from vllm.attention import AttentionType, get_attn_backend @@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): # can reuse the memory pool allocated for the large shapes. with graph_capture(device=self.device): skip_attn = not self.vllm_config.compilation_config.full_cuda_graph - for num_tokens in reversed(self.cudagraph_batch_sizes): + for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes), + desc="Capturing CUDA graphs", + total=len(self.cudagraph_batch_sizes)): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): self._dummy_run(num_tokens, skip_attn=skip_attn)