Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 13:05:44 +08:00)
[Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)

Signed-off-by: mgoin <mgoin64@gmail.com>

commit 24fab45d96
parent 63400259d0
@@ -509,6 +509,13 @@ class VllmConfig:
         if self.compilation_config.cudagraph_mode is None:
             if envs.VLLM_USE_V1 and self.compilation_config.level \
                 == CompilationLevel.PIECEWISE:
-                self.compilation_config.cudagraph_mode = \
-                    CUDAGraphMode.PIECEWISE
+                # default to full and piecewise for most models
+                self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.FULL_AND_PIECEWISE
+
+                # pooling model does not support full cudagraphs
+                if self.model_config is not None and \
+                    self.model_config.pooler_config is not None:
+                    self.compilation_config.cudagraph_mode = \
+                        CUDAGraphMode.PIECEWISE
             else:
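Taken together, the new default resolution reads as a small decision function. The sketch below is a standalone paraphrase, not vLLM's code: the enum is a stand-in, and since the `else:` branch is elided in the hunk, the NONE fallback here is an assumption.

```python
# Standalone paraphrase of the default-selection hunk above; the enum and
# the fallback for the elided else-branch are assumptions, not vLLM's code.
from enum import Enum

class CUDAGraphMode(Enum):
    NONE = 0
    PIECEWISE = 1
    FULL_AND_PIECEWISE = 2

def default_cudagraph_mode(use_v1: bool, piecewise_level: bool,
                           is_pooling_model: bool) -> CUDAGraphMode:
    if use_v1 and piecewise_level:
        # pooling models cannot run full cudagraphs, so they keep PIECEWISE
        if is_pooling_model:
            return CUDAGraphMode.PIECEWISE
        return CUDAGraphMode.FULL_AND_PIECEWISE
    return CUDAGraphMode.NONE  # assumed fallback; elided in the hunk

assert default_cudagraph_mode(True, True, False) is CUDAGraphMode.FULL_AND_PIECEWISE
assert default_cudagraph_mode(True, True, True) is CUDAGraphMode.PIECEWISE
```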
@@ -228,15 +228,14 @@ class CompilationConfig:
     The mode of the cudagraph:

     - NONE, no cudagraph capture.
-    - PIECEWISE. (v1 default)
+    - PIECEWISE.
     - FULL.
     - FULL_DECODE_ONLY.
-    - FULL_AND_PIECEWISE.
+    - FULL_AND_PIECEWISE. (v1 default)

     PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
     incompatible ops (i.e. some attention ops) outside the cudagraph
     for general flexibility.
-    This is the default mode.

     FULL mode: Capture full cudagraph for all batches. Can be good for small
     models or workloads with small prompts; not supported by many backends.
@@ -249,7 +248,7 @@ class CompilationConfig:
     FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
     piecewise cudagraph for prefill and mixed prefill-decode batches.
-    This is like the most performant mode for most models.
+    This is the most performant mode for most models and is the default.

     Currently, the cudagraph mode is only used for the v1 engine.
     Note that the cudagraph logic is generally orthogonal to the
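For users who want a mode other than the new default, the override path is the `cudagraph_mode` field documented above. A hedged usage sketch, assuming the `compilation_config` keyword of `vllm.LLM` accepts a plain dict and coerces the mode string to the enum (the model name is a placeholder):

```python
# Hedged usage sketch: force a specific cudagraph mode instead of the
# FULL_AND_PIECEWISE default. Assumes LLM(compilation_config=...) accepts
# a dict and coerces "cudagraph_mode" to the enum; check your vLLM version.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",  # placeholder model
    compilation_config={"cudagraph_mode": "PIECEWISE"},
)
```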
@@ -2947,8 +2947,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             # TODO(luka) better system for describing dummy batches
             seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
         else:
-            # Make sure max_model_len is used at the graph capture time.
-            seq_lens = self.max_model_len
+            seq_lens = max_query_len
         self.seq_lens.np[:num_reqs] = seq_lens
         self.seq_lens.np[num_reqs:] = 0
         self.seq_lens.copy_to_gpu()
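The buffer bookkeeping around this change is easy to see in isolation: live request slots get a sequence length, padded slots get zero, and under the new code the uniform dummy batch claims `max_query_len` rather than `max_model_len`. A minimal numpy sketch with made-up sizes (`seq_lens` in vLLM is a paired CPU/GPU buffer, not a bare array):

```python
# Minimal numpy sketch of the seq_lens fill above; sizes are made up.
import numpy as np

max_num_reqs = 8
seq_lens_np = np.zeros(max_num_reqs, dtype=np.int32)

# mixed dummy batch: decodes of length 1 plus one prefill remainder
num_decode_tokens, num_prefill_tokens = 4, 11
seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
num_reqs = len(seq_lens)
seq_lens_np[:num_reqs] = seq_lens
seq_lens_np[num_reqs:] = 0        # zero out padded request slots
print(seq_lens_np)                # [ 1  1  1  1 12  0  0  0]

# uniform dummy batch: every request claims max_query_len (new behavior)
num_reqs, max_query_len = 5, 16
seq_lens_np[:num_reqs] = max_query_len
seq_lens_np[num_reqs:] = 0
print(seq_lens_np)                # [16 16 16 16 16  0  0  0]
```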
@@ -3541,6 +3540,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     CUDAGraphMode.FULL_DECODE_ONLY
             logger.warning(msg)

+        # check that if we are doing decode full-cudagraphs it is supported
+        if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+                and min_cg_support == AttentionCGSupport.NEVER):
+            msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
+                   f"with {min_cg_builder_name} backend (support: "
+                   f"{min_cg_support})")
+            if (self.compilation_config.level == CompilationLevel.PIECEWISE and
+                    (self.compilation_config.splitting_ops_contain_attention()
+                     or self.compilation_config.use_inductor_graph_partition)):
+                msg += "; setting cudagraph_mode=PIECEWISE because "\
+                    "attention is compiled piecewise"
+                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.PIECEWISE
+            else:
+                msg += "; setting cudagraph_mode=NONE because "\
+                    "attention is not compiled piecewise"
+                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.NONE
+            logger.warning(msg)
+
         # check that if we are doing spec-decode + decode full-cudagraphs it is
         # supported
         if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
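The fallback added in this hunk also reduces to a small pure function. A sketch under the assumption that `decode_mode()` simply reports FULL for full-decode-capable modes; the booleans below stand in for the `AttentionCGSupport.NEVER`, `CompilationLevel.PIECEWISE`, and splitting-ops / inductor-graph-partition checks, and are not vLLM's real API:

```python
# Pure-function paraphrase of the decode full-cudagraph fallback above.
# The booleans stand in for backend/compilation checks; not vLLM's real API.
from enum import Enum

class CUDAGraphMode(Enum):
    NONE = 0
    PIECEWISE = 1
    FULL = 2

def fall_back_decode_full_cg(decode_mode: CUDAGraphMode,
                             backend_never_supports_cg: bool,
                             attention_compiled_piecewise: bool) -> CUDAGraphMode:
    if decode_mode is CUDAGraphMode.FULL and backend_never_supports_cg:
        # keep piecewise graphs if attention stays outside them,
        # otherwise disable cudagraphs entirely
        return (CUDAGraphMode.PIECEWISE if attention_compiled_piecewise
                else CUDAGraphMode.NONE)
    return decode_mode

assert fall_back_decode_full_cg(CUDAGraphMode.FULL, True, True) is CUDAGraphMode.PIECEWISE
assert fall_back_decode_full_cg(CUDAGraphMode.FULL, True, False) is CUDAGraphMode.NONE
```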