diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index df60cb130a1b3..a4ff01feb2a72 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -78,7 +78,6 @@ class EagleProposer:
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
                                == CompilationLevel.PIECEWISE
                                and not self.vllm_config.model_config.enforce_eager)
-
         self.cudagraph_runtime_mode = (CUDAGraphMode.PIECEWISE if
                                        self.use_cuda_graph else CUDAGraphMode.NONE)
 
@@ -678,12 +677,14 @@ class EagleProposer:
     def dummy_run(
         self,
         num_tokens: int,
+        use_cudagraphs=True,
     ) -> None:
         with set_forward_context(
                 None,
                 self.vllm_config,
                 num_tokens=num_tokens,
-                cudagraph_runtime_mode=self.cudagraph_runtime_mode,
+                cudagraph_runtime_mode=self.cudagraph_runtime_mode \
+                if use_cudagraphs else CUDAGraphMode.NONE,
         ):
             if self.is_multimodal_model:
                 input_ids = None
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f256dc160a6b5..f32a9d9a610cd 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2997,7 +2997,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
-            self.drafter.dummy_run(num_tokens)
+            # For warmup runs don't use cudagraphs in drafter
+            self.drafter.dummy_run(num_tokens, use_cudagraphs=False)
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real