From 21d5daa4aca6e16c0c42dbfdf704fdfd0006ba4c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 18 Dec 2023 18:16:17 -0800
Subject: [PATCH] Add warning on CUDA graph memory usage (#2182)

---
 vllm/worker/model_runner.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 276ef0708847..5623d27df3a3 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -395,6 +395,9 @@ class ModelRunner:
                     "unexpected consequences if the model is not static. To "
                     "run the model in eager mode, set 'enforce_eager=True' or "
                     "use '--enforce-eager' in the CLI.")
+        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
+                    "If you are running out of memory, consider decreasing "
+                    "`gpu_memory_utilization` or enforcing eager mode.")
         start_time = time.perf_counter()
 
         # Prepare dummy inputs. These will be reused for all batch sizes.