mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-19 06:25:01 +08:00
Remove chunked_prefill_enabled flag in V1 MLA (#23183)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
This commit is contained in:
parent
1b125004be
commit
a4fbb32fab
@ -416,7 +416,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
|||||||
self.model_config = vllm_config.model_config
|
self.model_config = vllm_config.model_config
|
||||||
cache_config = vllm_config.cache_config
|
cache_config = vllm_config.cache_config
|
||||||
parallel_config = vllm_config.parallel_config
|
parallel_config = vllm_config.parallel_config
|
||||||
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
|
|
||||||
self.num_heads = self.model_config.get_num_attention_heads(
|
self.num_heads = self.model_config.get_num_attention_heads(
|
||||||
parallel_config)
|
parallel_config)
|
||||||
self.mla_dims = get_mla_dims(self.model_config)
|
self.mla_dims = get_mla_dims(self.model_config)
|
||||||
@ -426,13 +425,11 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
|||||||
if self.aot_schedule:
|
if self.aot_schedule:
|
||||||
self.page_size = self.kv_cache_spec.block_size
|
self.page_size = self.kv_cache_spec.block_size
|
||||||
|
|
||||||
if self.chunked_prefill_enabled:
|
|
||||||
self.chunked_prefill_workspace_size = min(
|
self.chunked_prefill_workspace_size = min(
|
||||||
# Make sure there is enough for 8 full-length requests or at least
|
# Make sure there is enough for 8 full-length requests or at least
|
||||||
# 4 pages of cache per request
|
# 4 pages of cache per request
|
||||||
max(
|
max(8 * self.model_config.max_model_len,
|
||||||
8 * self.model_config.max_model_len, 4 *
|
4 * scheduler_config.max_num_seqs * cache_config.block_size),
|
||||||
scheduler_config.max_num_seqs * cache_config.block_size),
|
|
||||||
# For long-context models try not to over-allocate limiting
|
# For long-context models try not to over-allocate limiting
|
||||||
# kv-cache space, limiting it to 64k tokens,
|
# kv-cache space, limiting it to 64k tokens,
|
||||||
# which would result in the workspace being:
|
# which would result in the workspace being:
|
||||||
@ -620,8 +617,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
|
|||||||
reqs_start:] - query_start_loc[reqs_start]
|
reqs_start:] - query_start_loc[reqs_start]
|
||||||
|
|
||||||
chunked_context_metadata = None
|
chunked_context_metadata = None
|
||||||
if self.chunked_prefill_enabled and num_prefills > 0 \
|
if max_context_len_cpu > 0:
|
||||||
and max_context_len_cpu > 0:
|
|
||||||
# NOTE: it is recommended that you read the `Chunked Prefill` section
|
# NOTE: it is recommended that you read the `Chunked Prefill` section
|
||||||
# in the comment at the top of the file before trying to
|
# in the comment at the top of the file before trying to
|
||||||
# understand the following code
|
# understand the following code
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user