From 1a2f8fb828f0444705db319786b2e901159f184e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Mon, 9 Dec 2024 13:47:24 -0800
Subject: [PATCH] [v1] fix use compile sizes (#11000)

Signed-off-by: youkaichao
---
 vllm/config.py                     | 1 +
 vllm/v1/worker/gpu_model_runner.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/vllm/config.py b/vllm/config.py
index 29f0839dcabb..5fb9563fcf3a 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2522,6 +2522,7 @@ class VllmConfig:
             self.compilation_config.custom_ops = ["none"]
             self.compilation_config.use_cudagraph = True
             self.compilation_config.use_inductor = True
+            self.compilation_config.cudagraph_num_of_warmups = 1
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_reshape = False
             self.compilation_config.level = CompilationLevel.PIECEWISE
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7f95be06188e..c601aca13fea 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -582,6 +582,9 @@ class GPUModelRunner:
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture():
             for num_tokens in reversed(self.cudagraph_batch_sizes):
+                for _ in range(self.vllm_config.compilation_config.
+                               cudagraph_num_of_warmups):
+                    self._dummy_run(self.model, num_tokens, self.kv_caches)
                 self._dummy_run(self.model, num_tokens, self.kv_caches)
 
         end_time = time.perf_counter()