From 19748806f04e6e390b42c2318a5ca76ffb6c1368 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Tue, 21 Oct 2025 18:39:09 -0400 Subject: [PATCH] [Bugfix] skip cuda graph for drafter when running with eager (#26821) Signed-off-by: Benjamin Chislett --- vllm/v1/worker/gpu_model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7c2cb701fd64c..b2d99a0ec69bc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3482,7 +3482,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - use_cudagraphs = cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + use_cudagraphs = ( + cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + and not self.speculative_config.enforce_eager + ) self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs) # This is necessary to avoid blocking DP.