[BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910)

Author: Lucas Wilkinson
Date: 2025-03-16 23:35:37 -04:00
Committed by: GitHub
parent 7f6c5ee06c
commit 1e799b7ec1


@@ -152,7 +152,7 @@ class CudaPlatformBase(Platform):
             # here
             use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
                 or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
-            from vllm.attention.backends.flashmla import is_flashmla_supported
+            from vllm.attention.ops.flashmla import is_flashmla_supported
             if use_flashmla and is_flashmla_supported()[0] \
                 and cache_config.block_size != 64:
                 cache_config.block_size = 64
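
The one-line import move is the entire fix: importing is_flashmla_supported from vllm.attention.backends.flashmla presumably pulls in backend machinery with import-time GPU side effects, which (per the title) reinitialized the CUDA context when MLA ran on the V1 engine with TP==1, while vllm.attention.ops.flashmla exposes the same probe without that cost. Below is a minimal sketch of the underlying pattern, with an illustrative module layout and package name that are not vLLM's real ones: keep capability probes in modules whose import is side-effect free, and defer anything heavier to call time.

    # light_probe.py -- illustrative module, not vLLM's actual layout.
    # Importing this module never touches the GPU, so platform-detection
    # code (like check_and_update_config above) can import it safely
    # before any worker processes are spawned.
    import importlib.util
    from typing import Tuple

    def is_flashmla_supported() -> Tuple[bool, str]:
        """Return (supported, reason), mirroring the [0]-indexed use above."""
        # find_spec() only checks that the optional kernel package exists;
        # it does NOT import it, so no import-time CUDA setup can run here.
        # "flash_mla" is a hypothetical package name for this sketch.
        if importlib.util.find_spec("flash_mla") is None:
            return False, "flash_mla kernels are not installed"
        return True, ""

The design point, as we read the diff, is that the caller only needs a boolean at config time to pick a kv-cache block size; actually loading the FlashMLA kernels can wait until the attention backend is constructed inside the worker process, after ownership of the CUDA context is settled.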