Remove chunked_prefill_enabled flag in V1 MLA (#23183)

Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Matthew Bonanni 2025-08-20 17:43:17 -04:00 committed by GitHub
parent 1b125004be
commit a4fbb32fab

@@ -416,7 +416,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         self.model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         parallel_config = vllm_config.parallel_config
-        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
         self.num_heads = self.model_config.get_num_attention_heads(
             parallel_config)
         self.mla_dims = get_mla_dims(self.model_config)
@@ -426,13 +425,11 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         if self.aot_schedule:
             self.page_size = self.kv_cache_spec.block_size

-        if self.chunked_prefill_enabled:
-            self.chunked_prefill_workspace_size = min(
-                # Make sure there is enough for 8 full length requests or
-                # at least 4 pages of cache per request
-                max(
-                    8 * self.model_config.max_model_len, 4 *
-                    scheduler_config.max_num_seqs * cache_config.block_size),
-                # For long-context models, avoid over-allocating (which
-                # limits kv-cache space) by capping this at 64k tokens,
-                # which would result in the workspace being:
+        self.chunked_prefill_workspace_size = min(
+            # Make sure there is enough for 8 full length requests or
+            # at least 4 pages of cache per request
+            max(8 * self.model_config.max_model_len,
+                4 * scheduler_config.max_num_seqs * cache_config.block_size),
+            # For long-context models, avoid over-allocating (which
+            # limits kv-cache space) by capping this at 64k tokens,
+            # which would result in the workspace being:
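For intuition, here is a minimal sketch of how that bound behaves. This is not vLLM code: the helper name and config values are made up, and the 64k cap is taken from the comment above, since the actual constant sits just past the shown diff context.

def workspace_tokens(max_model_len, max_num_seqs, block_size,
                     cap=64 * 1024):
    # Enough for 8 full-length requests, or at least 4 cache pages per
    # request, but never more than the long-context cap.
    return min(max(8 * max_model_len, 4 * max_num_seqs * block_size), cap)

# e.g. a 32k-context model, 256 sequences, 16-token blocks:
# max(8 * 32768, 4 * 256 * 16) = max(262144, 16384) -> capped to 65536
print(workspace_tokens(32768, 256, 16))  # 65536

With the flag gone, this size is computed unconditionally rather than only when the scheduler enabled chunked prefill.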
@@ -620,8 +617,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
             reqs_start:] - query_start_loc[reqs_start]

         chunked_context_metadata = None
-        if self.chunked_prefill_enabled and num_prefills > 0 \
-                and max_context_len_cpu > 0:
+        if max_context_len_cpu > 0:
             # NOTE: it is recommended you read the `Chunked Prefill` section
             # in the comment at the top of the file before trying to
             # understand the following code
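Taken together, the commit collapses a config-dependent branch into straight-line code, presumably because the V1 engine always schedules with chunked prefill. A hedged before/after sketch with simplified names follows; the real builder (MLACommonMetadataBuilder in vllm/v1/attention/backends/mla/common.py) carries far more state, and the helper below is a placeholder.

def build_chunked_context_metadata():
    # stand-in for the real chunked-context metadata construction
    return "chunked_context_metadata"

class BuilderBefore:
    def __init__(self, scheduler_config):
        # gate read from the scheduler config (now removed)
        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled

    def context_metadata(self, num_prefills, max_context_len_cpu):
        if (self.chunked_prefill_enabled and num_prefills > 0
                and max_context_len_cpu > 0):
            return build_chunked_context_metadata()
        return None

class BuilderAfter:
    def context_metadata(self, num_prefills, max_context_len_cpu):
        # Only the context-length check survives (per the last hunk above);
        # the flag and the num_prefills guard are gone.
        if max_context_len_cpu > 0:
            return build_chunked_context_metadata()
        return None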