[bugfix] support eagle with lora cudagraph specialization (#28318)
Signed-off-by: gnovack <gnovack@amazon.com>
parent 781f5ebf52
commit 70af44fd10
@@ -3602,7 +3602,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
                 and not self.speculative_config.enforce_eager
             )
-            self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs)
+
+            # Note(gnovack) - We need to disable cudagraphs for one of the two
+            # lora cases when cudagraph_specialize_lora is enabled. This is a
+            # short term mitigation for issue mentioned in
+            # https://github.com/vllm-project/vllm/issues/28334
+            if self.compilation_config.cudagraph_specialize_lora and activate_lora:
+                use_cudagraphs = False
+
+            self.drafter.dummy_run(
+                num_tokens,
+                use_cudagraphs=use_cudagraphs,
+            )
 
         # This is necessary to avoid blocking DP.
         # For dummy runs, we typically skip EPLB since we don't have any real
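For readers skimming the change, the decision the new lines encode can be restated as a small standalone helper. The sketch below is illustrative only: the helper name, the dataclass stand-ins for speculative_config and compilation_config, and the piecewise_cudagraphs flag (standing in for cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE) are hypothetical, not vLLM API; only the attribute names and the gating condition come from the hunk above.

from dataclasses import dataclass


@dataclass
class SpecConfigStub:
    # Stand-in for the part of speculative_config used in the hunk.
    enforce_eager: bool


@dataclass
class CompilationConfigStub:
    # Stand-in for the part of compilation_config used in the hunk.
    cudagraph_specialize_lora: bool


def drafter_dummy_run_uses_cudagraphs(
    piecewise_cudagraphs: bool,
    spec: SpecConfigStub,
    comp: CompilationConfigStub,
    activate_lora: bool,
) -> bool:
    # Base condition from the context lines: cudagraphs only when the runtime
    # mode is PIECEWISE and the speculative config does not enforce eager mode.
    use_cudagraphs = piecewise_cudagraphs and not spec.enforce_eager
    # Mitigation added by this commit (see issue #28334): when cudagraphs are
    # specialized on LoRA activation, the LoRA-active dummy run skips cudagraphs.
    if comp.cudagraph_specialize_lora and activate_lora:
        use_cudagraphs = False
    return use_cudagraphs


# Example: with specialization enabled, only the non-LoRA dummy run keeps cudagraphs.
assert drafter_dummy_run_uses_cudagraphs(
    True,
    SpecConfigStub(enforce_eager=False),
    CompilationConfigStub(cudagraph_specialize_lora=True),
    activate_lora=True,
) is False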