mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-31 20:07:08 +08:00
wip
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
90d24dee04
commit
a1e3c09cba
@ -78,7 +78,6 @@ class EagleProposer:
|
|||||||
self.use_cuda_graph = (self.vllm_config.compilation_config.level
|
self.use_cuda_graph = (self.vllm_config.compilation_config.level
|
||||||
== CompilationLevel.PIECEWISE and
|
== CompilationLevel.PIECEWISE and
|
||||||
not self.vllm_config.model_config.enforce_eager)
|
not self.vllm_config.model_config.enforce_eager)
|
||||||
|
|
||||||
self.cudagraph_runtime_mode = (CUDAGraphMode.PIECEWISE
|
self.cudagraph_runtime_mode = (CUDAGraphMode.PIECEWISE
|
||||||
if self.use_cuda_graph else
|
if self.use_cuda_graph else
|
||||||
CUDAGraphMode.NONE)
|
CUDAGraphMode.NONE)
|
||||||
@ -678,12 +677,14 @@ class EagleProposer:
|
|||||||
def dummy_run(
|
def dummy_run(
|
||||||
self,
|
self,
|
||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
|
use_cudagraphs=True,
|
||||||
) -> None:
|
) -> None:
|
||||||
with set_forward_context(
|
with set_forward_context(
|
||||||
None,
|
None,
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
num_tokens=num_tokens,
|
num_tokens=num_tokens,
|
||||||
cudagraph_runtime_mode=self.cudagraph_runtime_mode,
|
cudagraph_runtime_mode=self.cudagraph_runtime_mode \
|
||||||
|
if use_cudagraphs else CUDAGraphMode.NONE,
|
||||||
):
|
):
|
||||||
if self.is_multimodal_model:
|
if self.is_multimodal_model:
|
||||||
input_ids = None
|
input_ids = None
|
||||||
|
|||||||
@ -2997,7 +2997,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
|
|
||||||
if self.speculative_config and self.speculative_config.use_eagle():
|
if self.speculative_config and self.speculative_config.use_eagle():
|
||||||
assert isinstance(self.drafter, EagleProposer)
|
assert isinstance(self.drafter, EagleProposer)
|
||||||
self.drafter.dummy_run(num_tokens)
|
# For warmup runs don't use cudagraphs in drafter
|
||||||
|
self.drafter.dummy_run(num_tokens, use_cudagraphs=False)
|
||||||
|
|
||||||
# This is necessary to avoid blocking DP.
|
# This is necessary to avoid blocking DP.
|
||||||
# For dummy runs, we typically skip EPLB since we don't have any real
|
# For dummy runs, we typically skip EPLB since we don't have any real
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user