[Bugfix] skip cuda graph for drafter when running with eager (#26821)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
commit 19748806f0
parent 4a8a567e16
@@ -3482,7 +3482,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
-            use_cudagraphs = cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+            use_cudagraphs = (
+                cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+                and not self.speculative_config.enforce_eager
+            )
             self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs)

         # This is necessary to avoid blocking DP.
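The change gates CUDA-graph usage for the EAGLE drafter's dummy run on the speculative config's enforce_eager flag, so a drafter forced into eager mode is no longer asked to capture CUDA graphs. Below is a minimal, self-contained sketch of that gating logic. The CUDAGraphMode enum and SpeculativeConfig dataclass here are simplified stand-ins for illustration, not the actual vLLM classes.

    from dataclasses import dataclass
    from enum import Enum, auto


    class CUDAGraphMode(Enum):
        # Simplified stand-in for vLLM's CUDA graph runtime modes.
        NONE = auto()
        PIECEWISE = auto()
        FULL = auto()


    @dataclass
    class SpeculativeConfig:
        # Simplified stand-in: only the flag relevant to this fix.
        enforce_eager: bool = False


    def drafter_use_cudagraphs(
        cudagraph_runtime_mode: CUDAGraphMode,
        speculative_config: SpeculativeConfig,
    ) -> bool:
        # CUDA graphs are used for the drafter's dummy run only when the
        # runtime mode is PIECEWISE and eager execution is not enforced.
        return (
            cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
            and not speculative_config.enforce_eager
        )


    # With enforce_eager set, the drafter skips CUDA-graph capture even in
    # PIECEWISE mode; before this fix the flag was ignored for the drafter.
    assert not drafter_use_cudagraphs(
        CUDAGraphMode.PIECEWISE, SpeculativeConfig(enforce_eager=True)
    )
    assert drafter_use_cudagraphs(
        CUDAGraphMode.PIECEWISE, SpeculativeConfig(enforce_eager=False)
    )
    assert not drafter_use_cudagraphs(CUDAGraphMode.NONE, SpeculativeConfig())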