[Core] Whisper enable FULL_DECODE_ONLY CudaGraph (#30072)
Signed-off-by: NickLucche <nlucches@redhat.com>
commit c756fb6781 (parent d017bceb08)
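For orientation before the diff: this change lets encoder-decoder models such as Whisper capture decode-only full CUDA graphs instead of disabling full cudagraphs outright. A minimal usage sketch, assuming CompilationConfig and CUDAGraphMode are importable from vllm.config and that LLM accepts a compilation_config argument, as in current vLLM; the model name is only an example:

    from vllm import LLM
    from vllm.config import CompilationConfig, CUDAGraphMode

    # Sketch: pin the mode this commit now selects by default for
    # encoder-decoder models; after the change, leaving cudagraph_mode unset
    # should resolve to FULL_DECODE_ONLY for Whisper automatically.
    llm = LLM(
        model="openai/whisper-large-v3-turbo",
        compilation_config=CompilationConfig(
            cudagraph_mode=CUDAGraphMode.FULL_DECODE_ONLY
        ),
    )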
@@ -103,6 +103,8 @@ def run_test(
         max_model_len=448,
         tensor_parallel_size=tensor_parallel_size,
         distributed_executor_backend=distributed_executor_backend,
+        # TODO (NickLucche) figure out output differences with non-eager and re-enable
+        enforce_eager=True,
     ) as vllm_model:
         llm = vllm_model.llm
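Note: the test pins enforce_eager=True for now (see the TODO), and per the config hunk further down, enforce_eager disables cudagraphs entirely. A rough standalone equivalent of the test's model setup, assuming the fixture forwards these arguments to vllm.LLM; the model name is a placeholder for the test's parametrized model:

    from vllm import LLM

    # Sketch: mirrors the settings the test passes through vllm_runner.
    # enforce_eager=True skips CUDA graph capture, so the new FULL_DECODE_ONLY
    # path is only exercised once the TODO above is resolved.
    llm = LLM(
        model="openai/whisper-large-v3-turbo",  # placeholder
        max_model_len=448,
        tensor_parallel_size=1,
        enforce_eager=True,
    )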
@@ -666,8 +666,9 @@ class VllmConfig:
         default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
         self._apply_optimization_level_defaults(default_config)
 
         if (
-            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
             and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
         ):
             logger.info(
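Note: the guard now asks whether the selected cudagraph mode actually requires piecewise compilation (and therefore VLLM_COMPILE), instead of merely checking that it is not NONE; FULL_DECODE_ONLY captures whole decode graphs and needs no piecewise split. An illustrative, self-contained stand-in for that distinction (not vLLM's implementation, just the decision it encodes):

    from enum import Enum

    class Mode(Enum):
        # simplified stand-in for vllm.config.CUDAGraphMode
        NONE = "NONE"
        PIECEWISE = "PIECEWISE"
        FULL = "FULL"
        FULL_DECODE_ONLY = "FULL_DECODE_ONLY"
        FULL_AND_PIECEWISE = "FULL_AND_PIECEWISE"

    def requires_piecewise_compilation(mode: Mode) -> bool:
        # Only modes that run piecewise graphs need the model split up by the
        # compiler; decode-only full graphs do not.
        return mode in (Mode.PIECEWISE, Mode.FULL_AND_PIECEWISE)

    # FULL_DECODE_ONLY no longer triggers the "needs VLLM_COMPILE" warning.
    assert not requires_piecewise_compilation(Mode.FULL_DECODE_ONLY)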
@@ -692,22 +693,29 @@ class VllmConfig:
 
         if current_platform.support_static_graph_mode():
             # if cudagraph_mode has full cudagraphs, we need to check support
-            if (
-                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
-                and self.model_config is not None
-            ):
-                if self.model_config.pooler_config is not None:
+            if model_config := self.model_config:
+                if (
+                    self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+                    and model_config.pooler_config is not None
+                ):
                     logger.warning_once(
                         "Pooling models do not support full cudagraphs. "
                         "Overriding cudagraph_mode to PIECEWISE."
                     )
                     self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-                elif self.model_config.is_encoder_decoder:
-                    logger.warning_once(
-                        "Encoder-decoder models do not support full cudagraphs. "
-                        "Overriding cudagraph_mode to PIECEWISE."
-                    )
-                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                elif (
+                    model_config.is_encoder_decoder
+                    and self.compilation_config.cudagraph_mode
+                    not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
+                ):
+                    logger.info_once(
+                        "Encoder-decoder models do not support %s. "
+                        "Overriding cudagraph_mode to FULL_DECODE_ONLY.",
+                        self.compilation_config.cudagraph_mode.name,
+                    )
+                    self.compilation_config.cudagraph_mode = (
+                        CUDAGraphMode.FULL_DECODE_ONLY
+                    )
 
         # disable cudagraph when enforce eager execution
         if self.model_config is not None and self.model_config.enforce_eager:
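Note: the net behavior of the rewritten branch is that pooling models still fall back to PIECEWISE, while encoder-decoder models are bumped to FULL_DECODE_ONLY unless the mode is already NONE or FULL_DECODE_ONLY. A condensed, string-based sketch of that decision; the real code operates on CUDAGraphMode and model_config, and is_pooling/is_encoder_decoder are placeholder flags:

    def resolve_cudagraph_mode(mode: str, is_pooling: bool, is_encoder_decoder: bool) -> str:
        # Condensed view of the VllmConfig branch in the hunk above.
        if is_pooling and mode in {"FULL", "FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"}:
            return "PIECEWISE"  # pooling models do not support full cudagraphs
        if is_encoder_decoder and mode not in {"NONE", "FULL_DECODE_ONLY"}:
            return "FULL_DECODE_ONLY"  # Whisper-style models: decode-only graphs
        return mode

    assert resolve_cudagraph_mode("FULL_AND_PIECEWISE", False, True) == "FULL_DECODE_ONLY"
    assert resolve_cudagraph_mode("FULL", True, False) == "PIECEWISE"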
@@ -1267,6 +1267,8 @@ class GPUModelRunner(
         if not isinstance(kv_cache_spec, CrossAttentionSpec):
             return None, None
 
+        # Zero out buffer for padding requests that are not actually scheduled (CGs)
+        self.encoder_seq_lens.np[:num_reqs] = 0
         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
         for req_id in num_scheduled_tokens:
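Note: with decode-only full graphs the batch is padded up to the captured size, so request slots beyond those actually scheduled must not carry stale encoder lengths from a previous step. A small numpy sketch of the zero-then-fill pattern this hunk applies; the buffer name and sizes are illustrative:

    import numpy as np

    MAX_REQS = 8
    encoder_seq_lens = np.empty(MAX_REQS, dtype=np.int32)  # persistent buffer, reused every step

    def build_encoder_seq_lens(num_reqs: int, scheduled: dict[int, int]) -> np.ndarray:
        # Zero the live region first so padded slots (requests present in the
        # batch but not scheduled this step) read as 0 when the CUDA graph
        # replays over the padded batch.
        encoder_seq_lens[:num_reqs] = 0
        for req_index, enc_len in scheduled.items():
            encoder_seq_lens[req_index] = enc_len
        return encoder_seq_lens[:num_reqs]

    print(build_encoder_seq_lens(4, {1: 1500}))  # [   0 1500    0    0]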
@@ -2764,6 +2766,7 @@ class GPUModelRunner(
         # be improved in model runner v2)
         force_uniform_decode: bool | None = None,
         force_has_lora: bool | None = None,
+        num_encoder_reqs: int = 0,
     ) -> tuple[
         CUDAGraphMode,
         BatchDescriptor,
@@ -2780,6 +2783,11 @@ class GPUModelRunner(
             if force_uniform_decode is None
             else force_uniform_decode
         )
+        # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
+        # is present). Also, chunked-prefill is disabled, so batches are uniform.
+        has_encoder_output = (
+            self.model_config.is_encoder_decoder and num_encoder_reqs > 0
+        )
 
         has_lora = (
             len(self.input_batch.lora_id_to_lora_request) > 0
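Note: encoder-decoder requests produce encoder output only on their first step; every later step is a uniform single-token decode, which is exactly what a FULL_DECODE_ONLY graph captures. A hedged sketch of the gating added here and consumed by the dispatch call in the next hunk:

    def can_replay_decode_graph(
        is_encoder_decoder: bool, num_encoder_reqs: int, uniform_decode: bool
    ) -> bool:
        # Mirrors the intent of has_encoder_output: if any request still needs
        # its encoder pass this step, the captured decode-only graph (which
        # contains no encoder) cannot be replayed.
        has_encoder_output = is_encoder_decoder and num_encoder_reqs > 0
        return uniform_decode and not has_encoder_output

    # Whisper prefill step: the audio encoder runs, so no graph replay.
    assert can_replay_decode_graph(True, num_encoder_reqs=1, uniform_decode=False) is False
    # Later decode steps: cross-attention KV is cached, replay is fine.
    assert can_replay_decode_graph(True, num_encoder_reqs=0, uniform_decode=True) is True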
@@ -2799,7 +2807,7 @@ class GPUModelRunner(
         )
 
         cudagraph_mode, batch_descriptor = dispatch_cudagraph(
-            num_tokens_padded, use_cascade_attn
+            num_tokens_padded, use_cascade_attn or has_encoder_output
         )
         num_tokens_padded = batch_descriptor.num_tokens
@@ -2997,6 +3005,7 @@ class GPUModelRunner(
             num_scheduled_tokens_np=num_scheduled_tokens_np,
             max_num_scheduled_tokens=max_num_scheduled_tokens,
             use_cascade_attn=cascade_attn_prefix_lens is not None,
+            num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
         )
 
         logger.debug(
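Note: num_encoder_reqs is simply the number of requests with encoder inputs scheduled in this step. A minimal sketch, assuming scheduled_encoder_inputs maps request ids to the encoder input indices to run (as in the scheduler output consumed here):

    # Hypothetical scheduler output for one step: two requests still need their
    # Whisper audio encoder pass, the rest are already decoding.
    scheduled_encoder_inputs = {"req-0": [0], "req-3": [0]}

    num_encoder_reqs = len(scheduled_encoder_inputs)
    assert num_encoder_reqs == 2  # dispatch will skip the full-decode graph this step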