From f3137cdd81cae3a48282c22130fbcadcfc64ea95 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 23 Jul 2025 20:20:14 -0400 Subject: [PATCH] [Core] Freeze gc during cuda graph capture to speed up init (#21146) Signed-off-by: Codex Signed-off-by: mgoin --- vllm/envs.py | 7 +++++++ vllm/v1/worker/gpu_model_runner.py | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 16f635b3ac41d..ca45d69eec1b1 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -140,6 +140,7 @@ if TYPE_CHECKING: VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120 VLLM_USE_CUDNN_PREFILL: bool = False + VLLM_ENABLE_CUDAGRAPH_GC: bool = False VLLM_LOOPBACK_IP: str = "" @@ -968,6 +969,12 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_TRTLLM_DECODE_ATTENTION": lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None), + # Controls garbage collection during CUDA graph capture. + # If set to 0 (default), enables GC freezing to speed up capture time. + # If set to 1, allows GC to run during capture. + "VLLM_ENABLE_CUDAGRAPH_GC": + lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))), + # Used to force set up loopback IP "VLLM_LOOPBACK_IP": lambda: os.getenv("VLLM_LOOPBACK_IP", ""), diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3671b46607037..a5bf197ba1615 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2439,10 +2439,25 @@ class GPUModelRunner(LoRAModelRunnerMixin): start_time = time.perf_counter() start_free_gpu_memory = torch.cuda.mem_get_info()[0] + @contextmanager + def freeze_gc(): + # Optimize garbage collection during CUDA graph capture. + # Clean up, then freeze all remaining objects from being included + # in future collections. + gc.collect() + should_freeze = not envs.VLLM_ENABLE_CUDAGRAPH_GC + if should_freeze: + gc.freeze() + try: + yield + finally: + if should_freeze: + gc.unfreeze() + # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. - with graph_capture(device=self.device): + with freeze_gc(), graph_capture(device=self.device): full_cg = self.full_cuda_graph # Only rank 0 should print progress bar during capture compilation_cases = reversed(self.cudagraph_batch_sizes)