mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-31 13:07:11 +08:00
[Bugfix] DeepSeek V3.2 MTP metadata & CUDA graph issues (#26779)
Signed-off-by: xiaohajiayou <923390377@qq.com>
This commit is contained in:
parent
30a14b034f
commit
d811b442d3
@@ -109,6 +109,7 @@ class EagleProposer:
|
|||||||
else []
|
else []
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
|
||||||
# persistent buffers for cuda graph
|
# persistent buffers for cuda graph
|
||||||
self.input_ids = torch.zeros(
|
self.input_ids = torch.zeros(
|
||||||
self.max_num_tokens, dtype=torch.int32, device=device
|
self.max_num_tokens, dtype=torch.int32, device=device
|
||||||
@@ -939,7 +940,7 @@ class EagleProposer:
|
|||||||
self.vllm_config, DeepseekV32IndexerCache
|
self.vllm_config, DeepseekV32IndexerCache
|
||||||
)
|
)
|
||||||
draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names
|
draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names
|
||||||
self.attn_layer_names = list(draft_attn_layer_names)
|
self.attn_layer_names = list(draft_attn_layer_names - draft_indexer_layer_names)
|
||||||
self.indexer_layer_names = list(draft_indexer_layer_names)
|
self.indexer_layer_names = list(draft_indexer_layer_names)
|
||||||
|
|
||||||
if self.indexer_layer_names:
|
if self.indexer_layer_names:
|
||||||
@@ -1050,16 +1051,18 @@ class EagleProposer:
|
|||||||
num_tokens: int,
|
num_tokens: int,
|
||||||
use_cudagraphs=True,
|
use_cudagraphs=True,
|
||||||
) -> None:
|
) -> None:
|
||||||
if use_cudagraphs and num_tokens <= self.cudagraph_batch_sizes[-1]:
|
# Determine if CUDA graphs should be used for this run.
|
||||||
|
cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
|
||||||
|
if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
|
||||||
num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
|
||||||
|
|
||||||
with set_forward_context(
|
with set_forward_context(
|
||||||
None,
|
None,
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
num_tokens=num_tokens,
|
num_tokens=num_tokens,
|
||||||
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE
|
cudagraph_runtime_mode=(
|
||||||
if use_cudagraphs
|
CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE
|
||||||
else CUDAGraphMode.NONE,
|
),
|
||||||
):
|
):
|
||||||
if self.supports_mm_inputs:
|
if self.supports_mm_inputs:
|
||||||
input_ids = None
|
input_ids = None
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user