From 1e799b7ec1b1c61952d2ae24c85ecf3fcb0f6de3 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Sun, 16 Mar 2025 23:35:37 -0400
Subject: [PATCH] [BugFix] Fix MLA + V1 + TP==1 causing reinitialization of
 cuda context (#14910)

---
 vllm/platforms/cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 3897584307e91..8a53337ebc087 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -152,7 +152,7 @@ class CudaPlatformBase(Platform):
             # here
             use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
                 or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
-            from vllm.attention.backends.flashmla import is_flashmla_supported
+            from vllm.attention.ops.flashmla import is_flashmla_supported
             if use_flashmla and is_flashmla_supported()[0] \
                 and cache_config.block_size != 64:
                 cache_config.block_size = 64
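
The one-line change swaps the import source for `is_flashmla_supported` from the heavyweight `vllm.attention.backends.flashmla` module to the lighter `vllm.attention.ops.flashmla` module, so the platform check no longer drags in backend code (and with it a CUDA context) during config validation. Below is a minimal sketch, not part of the patch, of how the corrected import is consumed by the hunk's block-size logic; it assumes `is_flashmla_supported()` returns a tuple whose first element is a boolean and whose second element is a reason string, which is how the hunk indexes it.

```python
# Hypothetical standalone sketch mirroring the patched logic in
# vllm/platforms/cuda.py; names come from the hunk above, the (bool, reason)
# return shape is an assumption based on is_flashmla_supported()[0].
import os

from vllm.attention.ops.flashmla import is_flashmla_supported  # corrected import path

# Mirror of envs.VLLM_ATTENTION_BACKEND: unset means "use the default backend".
attention_backend = os.environ.get("VLLM_ATTENTION_BACKEND")
use_flashmla = attention_backend is None or attention_backend == "FLASHMLA"

supported, reason = is_flashmla_supported()
if use_flashmla and supported:
    # FlashMLA kernels expect a KV-cache block size of 64, so the platform
    # check forces cache_config.block_size to 64 when it differs.
    print("FlashMLA selected: block_size would be forced to 64")
else:
    print(f"FlashMLA not used (supported={supported}, reason={reason})")
```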