[Bugfix] Use PIECEWISE cudagraphs on Blackwell if max_model_len > 131072 (#27114)

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-06-08 15:29:09 +08:00 · 2025-10-17 15:47:18 -04:00 · 2025-10-17 15:47:18 -04:00 · 950cf9e58e
commit 950cf9e58e
parent 3125d79950
1 changed files with 39 additions and 17 deletions
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@ -350,26 +350,48 @@ class VllmConfig:
                    self.compilation_config.cudagraph_mode = (
                        CUDAGraphMode.FULL_AND_PIECEWISE
                    )
                    # pooling models and encoder-decoder models
                    # do not support full cudagraphs
                    if self.model_config is not None and (
                        self.model_config.pooler_config is not None
                        or self.model_config.is_encoder_decoder
                    ):
                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                    # decode context parallel do not support full cudagraphs now.
                    if self.parallel_config.decode_context_parallel_size > 1:
                        logger.warning(
                            "Decode context parallel (DCP) is enabled, which is "
                            "incompatible with full CUDA graphs. Set "
                            "cudagraph_mode to PIECEWISE."
                        )
                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                else:
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            # if cudagraph_mode has full cudagraphs, we need to check support
            if self.compilation_config.cudagraph_mode.has_full_cudagraphs():
                # decode context parallel does not support full cudagraphs
                if self.parallel_config.decode_context_parallel_size > 1:
                    logger.warning_once(
                        "Decode context parallel (DCP) is enabled, which is "
                        "incompatible with full CUDA graphs. "
                        "Overriding cudagraph_mode to PIECEWISE."
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                elif self.model_config is not None:
                    if self.model_config.pooler_config is not None:
                        logger.warning_once(
                            "Pooling models do not support full cudagraphs. "
                            "Overriding cudagraph_mode to PIECEWISE."
                        )
                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                    elif self.model_config.is_encoder_decoder:
                        logger.warning_once(
                            "Encoder-decoder models do not support full cudagraphs. "
                            "Overriding cudagraph_mode to PIECEWISE."
                        )
                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                    elif (
                        current_platform.is_cuda()
                        and current_platform.is_device_capability(100)
                        and self.model_config.max_model_len > 131072
                        and not self.model_config.use_mla
                    ):
                        # Refer to vllm/utils/flashinfer.py::use_trtllm_attention()
                        logger.warning_once(
                            "NVIDIA Blackwell TRTLLM attention cannot support "
                            "max_model_len >= 131072 (found "
                            f"{self.model_config.max_model_len}), causing dynamic "
                            "dispatching that breaks full cudagraphs. "
                            "Overriding cudagraph_mode to PIECEWISE."
                        )
                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
            # disable cudagraph when enforce eager execution
            if self.model_config is not None and self.model_config.enforce_eager:
                logger.info("Cudagraph is disabled under eager mode")