[bugfix] support eagle with lora cudagraph specialization (#28318)
Signed-off-by: gnovack <gnovack@amazon.com>
parent 781f5ebf52
commit 70af44fd10
@@ -3602,7 +3602,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
                 and not self.speculative_config.enforce_eager
             )
-            self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs)
+
+            # Note(gnovack) - We need to disable cudagraphs for one of the two
+            # lora cases when cudagraph_specialize_lora is enabled. This is a
+            # short term mitigation for issue mentioned in
+            # https://github.com/vllm-project/vllm/issues/28334
+            if self.compilation_config.cudagraph_specialize_lora and activate_lora:
+                use_cudagraphs = False
+
+            self.drafter.dummy_run(
+                num_tokens,
+                use_cudagraphs=use_cudagraphs,
+            )
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real
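For readers skimming the change, the decision the new lines encode can be restated as a small standalone helper. The sketch below is illustrative only: the helper name, the dataclass stand-ins for speculative_config and compilation_config, and the piecewise_cudagraphs flag (standing in for cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE) are hypothetical, not vLLM API; only the attribute names and the gating condition come from the hunk above.

from dataclasses import dataclass


@dataclass
class SpecConfigStub:
    # Stand-in for the part of speculative_config used in the hunk.
    enforce_eager: bool


@dataclass
class CompilationConfigStub:
    # Stand-in for the part of compilation_config used in the hunk.
    cudagraph_specialize_lora: bool


def drafter_dummy_run_uses_cudagraphs(
    piecewise_cudagraphs: bool,
    spec: SpecConfigStub,
    comp: CompilationConfigStub,
    activate_lora: bool,
) -> bool:
    # Base condition from the context lines: cudagraphs only when the runtime
    # mode is PIECEWISE and the speculative config does not enforce eager mode.
    use_cudagraphs = piecewise_cudagraphs and not spec.enforce_eager
    # Mitigation added by this commit (see issue #28334): when cudagraphs are
    # specialized on LoRA activation, the LoRA-active dummy run skips cudagraphs.
    if comp.cudagraph_specialize_lora and activate_lora:
        use_cudagraphs = False
    return use_cudagraphs


# Example: with specialization enabled, only the non-LoRA dummy run keeps cudagraphs.
assert drafter_dummy_run_uses_cudagraphs(
    True,
    SpecConfigStub(enforce_eager=False),
    CompilationConfigStub(cudagraph_specialize_lora=True),
    activate_lora=True,
) is False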