[Bugfix] skip cuda graph for drafter when running with eager (#26821)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
commit 19748806f0
parent 4a8a567e16
@@ -3482,7 +3482,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
-            use_cudagraphs = cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+            use_cudagraphs = (
+                cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+                and not self.speculative_config.enforce_eager
+            )
             self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs)

         # This is necessary to avoid blocking DP.
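The change gates CUDA-graph usage for the EAGLE drafter's dummy run on the speculative config's enforce_eager flag, so a drafter forced into eager mode is no longer asked to capture CUDA graphs. Below is a minimal, self-contained sketch of that gating logic. The CUDAGraphMode enum and SpeculativeConfig dataclass here are simplified stand-ins for illustration, not the actual vLLM classes.

    from dataclasses import dataclass
    from enum import Enum, auto


    class CUDAGraphMode(Enum):
        # Simplified stand-in for vLLM's CUDA graph runtime modes.
        NONE = auto()
        PIECEWISE = auto()
        FULL = auto()


    @dataclass
    class SpeculativeConfig:
        # Simplified stand-in: only the flag relevant to this fix.
        enforce_eager: bool = False


    def drafter_use_cudagraphs(
        cudagraph_runtime_mode: CUDAGraphMode,
        speculative_config: SpeculativeConfig,
    ) -> bool:
        # CUDA graphs are used for the drafter's dummy run only when the
        # runtime mode is PIECEWISE and eager execution is not enforced.
        return (
            cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
            and not speculative_config.enforce_eager
        )


    # With enforce_eager set, the drafter skips CUDA-graph capture even in
    # PIECEWISE mode; before this fix the flag was ignored for the drafter.
    assert not drafter_use_cudagraphs(
        CUDAGraphMode.PIECEWISE, SpeculativeConfig(enforce_eager=True)
    )
    assert drafter_use_cudagraphs(
        CUDAGraphMode.PIECEWISE, SpeculativeConfig(enforce_eager=False)
    )
    assert not drafter_use_cudagraphs(CUDAGraphMode.NONE, SpeculativeConfig())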