mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-18 11:56:11 +08:00
[Perf] Increase default max splits for FA3 full cudagraphs (#25495)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
bde2a1a8a4
commit
e0b24ea030
@ -119,7 +119,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_SERVER_DEV_MODE: bool = False
|
VLLM_SERVER_DEV_MODE: bool = False
|
||||||
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
|
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
|
||||||
VLLM_MLA_DISABLE: bool = False
|
VLLM_MLA_DISABLE: bool = False
|
||||||
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16
|
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
|
||||||
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
|
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
|
||||||
VLLM_RAY_BUNDLE_INDICES: str = ""
|
VLLM_RAY_BUNDLE_INDICES: str = ""
|
||||||
VLLM_CUDART_SO_PATH: Optional[str] = None
|
VLLM_CUDART_SO_PATH: Optional[str] = None
|
||||||
@ -1017,7 +1017,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# max number splits for cuda graph decode
|
# max number splits for cuda graph decode
|
||||||
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
|
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
|
||||||
lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
|
lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
|
||||||
"16")),
|
"32")),
|
||||||
|
|
||||||
# Number of GPUs per worker in Ray, if it is set to be a fraction,
|
# Number of GPUs per worker in Ray, if it is set to be a fraction,
|
||||||
# it allows ray to schedule multiple actors on a single GPU,
|
# it allows ray to schedule multiple actors on a single GPU,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user