mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-05 08:27:04 +08:00
wip
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
90d24dee04
commit
a1e3c09cba
@ -78,7 +78,6 @@ class EagleProposer:
|
||||
self.use_cuda_graph = (self.vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE and
|
||||
not self.vllm_config.model_config.enforce_eager)
|
||||
|
||||
self.cudagraph_runtime_mode = (CUDAGraphMode.PIECEWISE
|
||||
if self.use_cuda_graph else
|
||||
CUDAGraphMode.NONE)
|
||||
@ -678,12 +677,14 @@ class EagleProposer:
|
||||
def dummy_run(
|
||||
self,
|
||||
num_tokens: int,
|
||||
use_cudagraphs=True,
|
||||
) -> None:
|
||||
with set_forward_context(
|
||||
None,
|
||||
self.vllm_config,
|
||||
num_tokens=num_tokens,
|
||||
cudagraph_runtime_mode=self.cudagraph_runtime_mode,
|
||||
cudagraph_runtime_mode=self.cudagraph_runtime_mode \
|
||||
if use_cudagraphs else CUDAGraphMode.NONE,
|
||||
):
|
||||
if self.is_multimodal_model:
|
||||
input_ids = None
|
||||
|
||||
@ -2997,7 +2997,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
||||
|
||||
if self.speculative_config and self.speculative_config.use_eagle():
|
||||
assert isinstance(self.drafter, EagleProposer)
|
||||
self.drafter.dummy_run(num_tokens)
|
||||
# For warmup runs don't use cudagraphs in drafter
|
||||
self.drafter.dummy_run(num_tokens, use_cudagraphs=False)
|
||||
|
||||
# This is necessary to avoid blocking DP.
|
||||
# For dummy runs, we typically skip EPLB since we don't have any real
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user