Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[PERF] Remove TRTLLM Gen attn kernel limitation max_seq_len <=131072 (#28755)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
commit 173b356abf
parent 638e4196d1
@@ -483,21 +483,6 @@ class VllmConfig:
                     "Overriding cudagraph_mode to PIECEWISE."
                 )
                 self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-            elif (
-                current_platform.is_cuda()
-                and current_platform.is_device_capability(100)
-                and self.model_config.max_model_len > 131072
-                and not self.model_config.use_mla
-            ):
-                # Refer to vllm/utils/flashinfer.py::use_trtllm_attention()
-                logger.warning_once(
-                    "NVIDIA Blackwell TRTLLM attention cannot support "
-                    "max_model_len >= 131072 (found "
-                    f"{self.model_config.max_model_len}), causing dynamic "
-                    "dispatching that breaks full cudagraphs. "
-                    "Overriding cudagraph_mode to PIECEWISE."
-                )
-                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
         # disable cudagraph when enforce eager execution
         if self.model_config is not None and self.model_config.enforce_eager:
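For context, a minimal sketch of the condition this hunk deletes. The helper name forced_piecewise_before_commit is hypothetical and not part of vLLM; it only restates the removed elif branch, which on a Blackwell (SM100) GPU forced cudagraph_mode down to PIECEWISE for non-MLA models with max_model_len > 131072, since long sequences would otherwise fall off the TRTLLM kernel path and the resulting dynamic dispatch broke full cudagraph capture.

    # Hypothetical stand-in for the removed VllmConfig check (illustration only).
    def forced_piecewise_before_commit(
        is_cuda: bool,       # stands in for current_platform.is_cuda()
        is_sm100: bool,      # stands in for current_platform.is_device_capability(100)
        max_model_len: int,
        use_mla: bool,
    ) -> bool:
        """True if the removed elif branch would have downgraded cudagraph_mode."""
        return is_cuda and is_sm100 and max_model_len > 131072 and not use_mla

    # A 256K-context dense model on Blackwell used to trigger the downgrade;
    # a 128K-context model did not.
    assert forced_piecewise_before_commit(True, True, 262144, False)
    assert not forced_piecewise_before_commit(True, True, 131072, False)

With the max_seq_len cap removed from use_trtllm_attention() in the next hunk, the kernel choice no longer depends on sequence length, so this config-side downgrade is no longer needed.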
@@ -319,14 +319,12 @@ def use_trtllm_attention(
     # Environment variable not set - use auto-detection
     if is_prefill:
         # Prefill auto-detection
-        use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto"
+        use_trtllm = kv_cache_dtype == "auto"
         if use_trtllm:
             logger.warning_once("Using TRTLLM prefill attention (auto-detected).")
     else:
         # Decode auto-detection
-        use_trtllm = (
-            num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto"
-        )
+        use_trtllm = num_tokens <= 256 and kv_cache_dtype == "auto"
         if use_trtllm:
             logger.warning_once("Using TRTLLM decode attention (auto-detected).")
     return use_trtllm
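The auto-detection paths after this change, condensed into a standalone sketch. auto_detect_trtllm is a hypothetical helper for illustration; the real use_trtllm_attention() in vllm/utils/flashinfer.py takes more parameters and first honors an environment-variable override (the "Environment variable not set" comment above refers to that path). The sketch only mirrors the two conditions visible in the hunk.

    # Hypothetical helper mirroring the post-commit auto-detection conditions.
    def auto_detect_trtllm(is_prefill: bool, num_tokens: int, kv_cache_dtype: str) -> bool:
        if is_prefill:
            # Prefill: only the KV cache dtype gates the TRTLLM kernel now.
            return kv_cache_dtype == "auto"
        # Decode: the num_tokens <= 256 bound remains; max_seq_len no longer matters.
        return num_tokens <= 256 and kv_cache_dtype == "auto"

    # With the 131072 cap gone, long-context decode still dispatches to TRTLLM,
    # while large decode batches still fall back to the non-TRTLLM path.
    assert auto_detect_trtllm(is_prefill=False, num_tokens=128, kv_cache_dtype="auto")
    assert not auto_detect_trtllm(is_prefill=False, num_tokens=512, kv_cache_dtype="auto")

Only the kv_cache_dtype check gates prefill, and decode keeps the num_tokens bound; max_seq_len appears in neither condition, which is the limitation the commit title removes.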