From e0b24ea0305e0ead2c1cb1e0488744b5a91d524e Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Tue, 23 Sep 2025 19:53:34 -0400
Subject: [PATCH] [Perf] Increase default max splits for FA3 full cudagraphs (#25495)

Signed-off-by: Lucas Wilkinson
---
 vllm/envs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 33dae0be05f8d..689428ec59109 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -119,7 +119,7 @@ if TYPE_CHECKING:
     VLLM_SERVER_DEV_MODE: bool = False
     VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
     VLLM_MLA_DISABLE: bool = False
-    VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16
+    VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
     VLLM_RAY_PER_WORKER_GPUS: float = 1.0
     VLLM_RAY_BUNDLE_INDICES: str = ""
     VLLM_CUDART_SO_PATH: Optional[str] = None
@@ -1017,7 +1017,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # max number splits for cuda graph decode
     "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
     lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
-                          "16")),
+                          "32")),
 
     # Number of GPUs per worker in Ray, if it is set to be a fraction,
     # it allows ray to schedule multiple actors on a single GPU,