[BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910)

Author: Lucas Wilkinson
Date: 2025-03-16 23:35:37 -04:00
Committed by: GitHub
parent 7f6c5ee06c
commit 1e799b7ec1


@@ -152,7 +152,7 @@ class CudaPlatformBase(Platform):
             # here
             use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
                 or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
-            from vllm.attention.backends.flashmla import is_flashmla_supported
+            from vllm.attention.ops.flashmla import is_flashmla_supported
             if use_flashmla and is_flashmla_supported()[0] \
                 and cache_config.block_size != 64:
                 cache_config.block_size = 64
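
The one-line import move is the entire fix: importing is_flashmla_supported from vllm.attention.backends.flashmla presumably pulls in backend machinery with import-time GPU side effects, which (per the title) reinitialized the CUDA context when MLA ran on the V1 engine with TP==1, while vllm.attention.ops.flashmla exposes the same probe without that cost. Below is a minimal sketch of the underlying pattern, with an illustrative module layout and package name that are not vLLM's real ones: keep capability probes in modules whose import is side-effect free, and defer anything heavier to call time.

    # light_probe.py -- illustrative module, not vLLM's actual layout.
    # Importing this module never touches the GPU, so platform-detection
    # code (like check_and_update_config above) can import it safely
    # before any worker processes are spawned.
    import importlib.util
    from typing import Tuple

    def is_flashmla_supported() -> Tuple[bool, str]:
        """Return (supported, reason), mirroring the [0]-indexed use above."""
        # find_spec() only checks that the optional kernel package exists;
        # it does NOT import it, so no import-time CUDA setup can run here.
        # "flash_mla" is a hypothetical package name for this sketch.
        if importlib.util.find_spec("flash_mla") is None:
            return False, "flash_mla kernels are not installed"
        return True, ""

The design point, as we read the diff, is that the caller only needs a boolean at config time to pick a kv-cache block size; actually loading the FlashMLA kernels can wait until the attention backend is constructed inside the worker process, after ownership of the CUDA context is settled.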