mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-22 19:25:01 +08:00
[Core] Whisper enable FULL_DECODE_ONLY CudaGraph (#30072)
Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
parent
d017bceb08
commit
c756fb6781
@ -103,6 +103,8 @@ def run_test(
|
|||||||
max_model_len=448,
|
max_model_len=448,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
# TODO (NickLucche) figure out output differences with non-eager and re-enable
|
||||||
|
enforce_eager=True,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
llm = vllm_model.llm
|
llm = vllm_model.llm
|
||||||
|
|
||||||
|
|||||||
@ -666,8 +666,9 @@ class VllmConfig:
|
|||||||
|
|
||||||
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
||||||
self._apply_optimization_level_defaults(default_config)
|
self._apply_optimization_level_defaults(default_config)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
|
||||||
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
|
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
|
||||||
):
|
):
|
||||||
logger.info(
|
logger.info(
|
||||||
@ -692,22 +693,29 @@ class VllmConfig:
|
|||||||
|
|
||||||
if current_platform.support_static_graph_mode():
|
if current_platform.support_static_graph_mode():
|
||||||
# if cudagraph_mode has full cudagraphs, we need to check support
|
# if cudagraph_mode has full cudagraphs, we need to check support
|
||||||
if (
|
if model_config := self.model_config:
|
||||||
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
if (
|
||||||
and self.model_config is not None
|
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||||
):
|
and model_config.pooler_config is not None
|
||||||
if self.model_config.pooler_config is not None:
|
):
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Pooling models do not support full cudagraphs. "
|
"Pooling models do not support full cudagraphs. "
|
||||||
"Overriding cudagraph_mode to PIECEWISE."
|
"Overriding cudagraph_mode to PIECEWISE."
|
||||||
)
|
)
|
||||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||||
elif self.model_config.is_encoder_decoder:
|
elif (
|
||||||
logger.warning_once(
|
model_config.is_encoder_decoder
|
||||||
"Encoder-decoder models do not support full cudagraphs. "
|
and self.compilation_config.cudagraph_mode
|
||||||
"Overriding cudagraph_mode to PIECEWISE."
|
not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
|
||||||
|
):
|
||||||
|
logger.info_once(
|
||||||
|
"Encoder-decoder models do not support %s. "
|
||||||
|
"Overriding cudagraph_mode to FULL_DECODE_ONLY.",
|
||||||
|
self.compilation_config.cudagraph_mode.name,
|
||||||
|
)
|
||||||
|
self.compilation_config.cudagraph_mode = (
|
||||||
|
CUDAGraphMode.FULL_DECODE_ONLY
|
||||||
)
|
)
|
||||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
|
||||||
|
|
||||||
# disable cudagraph when enforce eager execution
|
# disable cudagraph when enforce eager execution
|
||||||
if self.model_config is not None and self.model_config.enforce_eager:
|
if self.model_config is not None and self.model_config.enforce_eager:
|
||||||
|
|||||||
@ -1267,6 +1267,8 @@ class GPUModelRunner(
|
|||||||
if not isinstance(kv_cache_spec, CrossAttentionSpec):
|
if not isinstance(kv_cache_spec, CrossAttentionSpec):
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
# Zero out buffer for padding requests that are not actually scheduled (CGs)
|
||||||
|
self.encoder_seq_lens.np[:num_reqs] = 0
|
||||||
# Build encoder_seq_lens array mapping request indices to
|
# Build encoder_seq_lens array mapping request indices to
|
||||||
# encoder lengths for inputs scheduled in this batch
|
# encoder lengths for inputs scheduled in this batch
|
||||||
for req_id in num_scheduled_tokens:
|
for req_id in num_scheduled_tokens:
|
||||||
@ -2764,6 +2766,7 @@ class GPUModelRunner(
|
|||||||
# be improved in model runner v2)
|
# be improved in model runner v2)
|
||||||
force_uniform_decode: bool | None = None,
|
force_uniform_decode: bool | None = None,
|
||||||
force_has_lora: bool | None = None,
|
force_has_lora: bool | None = None,
|
||||||
|
num_encoder_reqs: int = 0,
|
||||||
) -> tuple[
|
) -> tuple[
|
||||||
CUDAGraphMode,
|
CUDAGraphMode,
|
||||||
BatchDescriptor,
|
BatchDescriptor,
|
||||||
@ -2780,6 +2783,11 @@ class GPUModelRunner(
|
|||||||
if force_uniform_decode is None
|
if force_uniform_decode is None
|
||||||
else force_uniform_decode
|
else force_uniform_decode
|
||||||
)
|
)
|
||||||
|
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
|
||||||
|
# is present). Also, chunked-prefill is disabled, so batch are uniform.
|
||||||
|
has_encoder_output = (
|
||||||
|
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
||||||
|
)
|
||||||
|
|
||||||
has_lora = (
|
has_lora = (
|
||||||
len(self.input_batch.lora_id_to_lora_request) > 0
|
len(self.input_batch.lora_id_to_lora_request) > 0
|
||||||
@ -2799,7 +2807,7 @@ class GPUModelRunner(
|
|||||||
)
|
)
|
||||||
|
|
||||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||||
num_tokens_padded, use_cascade_attn
|
num_tokens_padded, use_cascade_attn or has_encoder_output
|
||||||
)
|
)
|
||||||
num_tokens_padded = batch_descriptor.num_tokens
|
num_tokens_padded = batch_descriptor.num_tokens
|
||||||
|
|
||||||
@ -2997,6 +3005,7 @@ class GPUModelRunner(
|
|||||||
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
||||||
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
||||||
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
||||||
|
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user