From c765f0b443c2de886a5771efcbc154ee84c2c19d Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Wed, 5 Nov 2025 09:25:32 -0800
Subject: [PATCH] [FlashInfer] Avoid FlashInfer block_size 16 + head_size 256
 on Blackwell (#27994)

Signed-off-by: Chen Zhang
---
 vllm/model_executor/models/config.py     | 12 ++++++++++++
 vllm/v1/attention/backends/flashinfer.py |  9 +++++++++
 2 files changed, 21 insertions(+)

diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 936e59117232f..33fa06fe0e9bc 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
+from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -356,6 +357,17 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
             ).page_size_bytes
         else:
             kernel_block_alignment_size = 16
+            if (
+                current_platform.is_device_capability(100)
+                and model_config.get_head_size() == 256
+                and (
+                    envs.VLLM_ATTENTION_BACKEND is None
+                    or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER"
+                )
+            ):
+                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
+                # head size 256 with block size 16 is not supported on Blackwell.
+                kernel_block_alignment_size = 32
         attn_page_size_1_token = FullAttentionSpec(
             block_size=1,
             num_kv_heads=model_config.get_num_kv_heads(parallel_config),
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index e71d4ca4629dc..ddc63b902dffb 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -402,6 +402,15 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         )
         self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy()
 
+        if self.head_dim == 256 and current_platform.is_device_capability(100):
+            # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that
+            # head size 256 with block size 16 is not supported on Blackwell.
+            assert kv_cache_spec.block_size != 16, (
+                "FlashInfer has a bug with block_size 16 and head size 256 "
+                "on Blackwell. Please avoid this combination by passing "
+                "--block-size 32 or --block-size 64."
+            )
+
     def _get_workspace_buffer(self):
         if self._workspace_buffer is None:
             buffer_size = FLASHINFER_WORKSPACE_BUFFER_SIZE
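
Illustrative usage note (not part of the diff): the assertion above asks users to avoid block_size 16 with head size 256 on Blackwell by choosing a larger KV-cache block size. A minimal sketch of that workaround from the user side, assuming vLLM is installed on a Blackwell (SM100) GPU; the model name and prompt are placeholders standing in for any model whose attention head size is 256:

    import os

    # Optional: select the FlashInfer backend explicitly. The patch also covers
    # the case where VLLM_ATTENTION_BACKEND is unset and FlashInfer is picked
    # by default.
    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

    from vllm import LLM

    llm = LLM(
        model="org/model-with-head-size-256",  # placeholder model name
        block_size=32,  # sidestep the unsupported block_size 16 + head_size 256 combination
    )
    outputs = llm.generate("Hello from Blackwell!")
    print(outputs[0].outputs[0].text)

The CLI equivalent is passing --block-size 32 (or --block-size 64) to vllm serve, as the assertion message suggests.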