diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 549c5dd2bbb2..897c3a621320 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2885,6 +2885,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): finally: if should_freeze: gc.unfreeze() + gc.collect() # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes