Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson 2025-09-17 22:41:43 +00:00
parent 90d24dee04
commit a1e3c09cba
2 changed files with 5 additions and 3 deletions

View File

@ -78,7 +78,6 @@ class EagleProposer:
self.use_cuda_graph = (self.vllm_config.compilation_config.level
== CompilationLevel.PIECEWISE and
not self.vllm_config.model_config.enforce_eager)
self.cudagraph_runtime_mode = (CUDAGraphMode.PIECEWISE
if self.use_cuda_graph else
CUDAGraphMode.NONE)
@ -678,12 +677,14 @@ class EagleProposer:
def dummy_run(
self,
num_tokens: int,
use_cudagraphs=True,
) -> None:
with set_forward_context(
None,
self.vllm_config,
num_tokens=num_tokens,
cudagraph_runtime_mode=self.cudagraph_runtime_mode,
cudagraph_runtime_mode=self.cudagraph_runtime_mode \
if use_cudagraphs else CUDAGraphMode.NONE,
):
if self.is_multimodal_model:
input_ids = None

View File

@ -2997,7 +2997,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.speculative_config and self.speculative_config.use_eagle():
assert isinstance(self.drafter, EagleProposer)
self.drafter.dummy_run(num_tokens)
# For warmup runs, don't use CUDA graphs in the drafter.
self.drafter.dummy_run(num_tokens, use_cudagraphs=False)
# This is necessary to avoid blocking DP.
# For dummy runs, we typically skip EPLB since we don't have any real