From da65bec3096b704f70606e9d11ee18aae453bbae Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Fri, 22 Aug 2025 12:25:45 -0700 Subject: [PATCH] add an env var for path to pre-downloaded flashinfer cubin files (#22675) --- vllm/envs.py | 6 ++++++ vllm/utils/flashinfer.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/vllm/envs.py b/vllm/envs.py index 296c1730892da..7ca6cee9abee9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -158,6 +158,7 @@ if TYPE_CHECKING: VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ENABLE_RESPONSES_API_STORE: bool = False VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None + VLLM_HAS_FLASHINFER_CUBIN: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None @@ -1105,6 +1106,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_TRTLLM_ATTENTION": lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None), + # If set, the FlashInfer cubin files were pre-downloaded, and FlashInfer + # will read them directly instead of fetching them at runtime. + "VLLM_HAS_FLASHINFER_CUBIN": + lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False), + # If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer. # Otherwise, uses the first available of: flashinfer cutlass GEMM, # vllm cutlass GEMM, marlin GEMM. diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 996be1265667c..5dd239c50f637 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -132,6 +132,11 @@ def has_nvidia_artifactory() -> bool: This checks connectivity to the kernel inference library artifactory which is required for downloading certain cubin kernels like TRTLLM FHMA. """ + # VLLM_HAS_FLASHINFER_CUBIN indicates the cubins were pre-downloaded (to + # FLASHINFER_CUBIN_DIR), so assume availability without a network check.
+ if envs.VLLM_HAS_FLASHINFER_CUBIN: + return True + try: # Use a short timeout to avoid blocking for too long response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5)