[UX] Add Feedback During CUDAGraph Capture (#19501)

Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Robert Shaw, 2025-06-11 14:09:05 -07:00, committed by GitHub
parent c7ea0b56cd
commit 97a9465bbc

vllm/v1/worker/gpu_model_runner.py

@@ -12,6 +12,7 @@ import numpy as np
 import torch
 import torch.distributed
 import torch.nn as nn
+from tqdm import tqdm
 
 import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
@@ -2034,7 +2035,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture(device=self.device):
             skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
-            for num_tokens in reversed(self.cudagraph_batch_sizes):
+            for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
+                                   desc="Capturing CUDA graphs",
+                                   total=len(self.cudagraph_batch_sizes)):
                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
                     self._dummy_run(num_tokens, skip_attn=skip_attn)
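
For reference, the same feedback pattern can be sketched outside of vLLM. The snippet below uses an illustrative batch_sizes list and a placeholder capture_one_graph helper (neither exists in vLLM) to show why the loop passes total= explicitly: reversed() returns an iterator without __len__, so tqdm cannot infer the bar length on its own.

import time

from tqdm import tqdm

# Illustrative stand-in for self.cudagraph_batch_sizes (ascending sizes).
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]


def capture_one_graph(num_tokens: int) -> None:
    # Placeholder for the warmup iterations and _dummy_run() graph capture.
    time.sleep(0.05)


# Largest shapes are captured first so smaller ones can reuse the memory pool;
# total= is required because reversed() yields an iterator with no __len__.
for num_tokens in tqdm(reversed(batch_sizes),
                       desc="Capturing CUDA graphs",
                       total=len(batch_sizes)):
    capture_one_graph(num_tokens)

With total= provided, tqdm can render a percentage and ETA rather than only a raw iteration count, which is the user-visible feedback this commit adds.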