Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson 2025-09-17 22:41:43 +00:00
parent 90d24dee04
commit a1e3c09cba
2 changed files with 5 additions and 3 deletions

View File

@ -78,7 +78,6 @@ class EagleProposer:
self.use_cuda_graph = (self.vllm_config.compilation_config.level
== CompilationLevel.PIECEWISE and
not self.vllm_config.model_config.enforce_eager)
self.cudagraph_runtime_mode = (CUDAGraphMode.PIECEWISE
if self.use_cuda_graph else
CUDAGraphMode.NONE)
@ -678,12 +677,14 @@ class EagleProposer:
def dummy_run(
self,
num_tokens: int,
use_cudagraphs=True,
) -> None:
with set_forward_context(
None,
self.vllm_config,
num_tokens=num_tokens,
cudagraph_runtime_mode=self.cudagraph_runtime_mode,
cudagraph_runtime_mode=self.cudagraph_runtime_mode \
if use_cudagraphs else CUDAGraphMode.NONE,
):
if self.is_multimodal_model:
input_ids = None

View File

@ -2997,7 +2997,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.speculative_config and self.speculative_config.use_eagle():
assert isinstance(self.drafter, EagleProposer)
self.drafter.dummy_run(num_tokens)
# For warmup runs, don't use CUDA graphs in the drafter.
self.drafter.dummy_run(num_tokens, use_cudagraphs=False)
# This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real