Remove chunked_prefill_enabled flag in V1 MLA (#23183)

Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
2026-07-18 07:37:10 +08:00 · 2025-08-20 17:43:17 -04:00 · 2025-08-20 17:43:17 -04:00 · a4fbb32fab
commit a4fbb32fab
parent 1b125004be
1 changed files with 23 additions and 27 deletions
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@ -416,7 +416,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
        self.model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config
        parallel_config = vllm_config.parallel_config
        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
        self.num_heads = self.model_config.get_num_attention_heads(
            parallel_config)
        self.mla_dims = get_mla_dims(self.model_config)
@ -426,13 +425,11 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
        if self.aot_schedule:
            self.page_size = self.kv_cache_spec.block_size
        if self.chunked_prefill_enabled:
        self.chunked_prefill_workspace_size = min(
            # Make sure there is enough for 8 full-length requests or at least
            # 4 pages of cache per request
-                max(
+            max(8 * self.model_config.max_model_len,
-                    8 * self.model_config.max_model_len, 4 *
+                4 * scheduler_config.max_num_seqs * cache_config.block_size),
                    scheduler_config.max_num_seqs * cache_config.block_size),
            # For long-context models try not to over-allocate limiting
            # kv-cache space, limiting it to 64k tokens,
            # which would result in the workspace being:
@ -620,8 +617,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                reqs_start:] - query_start_loc[reqs_start]
            chunked_context_metadata = None
-            if self.chunked_prefill_enabled and num_prefills > 0 \
+            if max_context_len_cpu > 0:
                and max_context_len_cpu > 0:
                # NOTE: it is recommended you read the `Chunked Prefill` section
                # in the comment at the top of the file before trying to
                # understand the following code