[Misc][PCP&DCP] relocate PCP feature check (#30050)
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent 853611bb18
commit a11f4a81e0
@@ -294,6 +294,12 @@ class AttentionImpl(ABC, Generic[T]):
     # Some features like decode context parallelism require the softmax lse.
     can_return_lse_for_decode: bool = False

+    # Whether the attention impl supports Prefill Context Parallelism.
+    supports_pcp: bool = False
+    # Whether the attention impl(or ops) supports MTP
+    # when cp_kv_cache_interleave_size > 1
+    supports_mtp_with_cp_non_trivial_interleave_size: bool = False
+
     # some attention backends might not always want to return lse
     # even if they can return lse (for efficiency reasons)
     need_to_return_lse_for_decode: bool = False
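As context for the new capability flags above, here is a minimal, hypothetical sketch of how a backend implementation might opt in to them. The class name, flag values, and import path below are illustrative assumptions, not part of this diff.

# Hypothetical backend opting in to the new capability flags.
# The import path is an assumption; adjust it to wherever AttentionImpl
# is defined in your checkout.
from vllm.attention.backends.abstract import AttentionImpl


class MyBackendImpl(AttentionImpl):
    # This backend can compute and return the softmax lse for decode (needed by DCP).
    can_return_lse_for_decode = True
    need_to_return_lse_for_decode = True
    # This backend has been validated with Prefill Context Parallelism.
    supports_pcp = True
    # MTP with cp_kv_cache_interleave_size > 1 has not been verified here.
    supports_mtp_with_cp_non_trivial_interleave_size = False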
@@ -317,11 +317,6 @@ class ParallelConfig:
                 "num_redundant_experts."
             )

-        if self.prefill_context_parallel_size > 1:
-            raise ValueError(
-                "Prefill context parallelism is not fully supported. "
-                "Please set prefill_context_parallel_size to 1."
-            )
         return self

     @property
@@ -820,11 +820,6 @@ class VllmConfig:
                 f"({self.parallel_config.cp_kv_cache_interleave_size})."
             )

-        assert (
-            self.parallel_config.cp_kv_cache_interleave_size == 1
-            or self.speculative_config is None
-        ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
-
         # Do this after all the updates to compilation_config.mode
         self.compilation_config.set_splitting_ops_for_v1(
             all2all_backend=self.parallel_config.all2all_backend,
@@ -1848,16 +1848,6 @@ class EngineArgs:
         default_chunked_prefill = model_config.is_chunked_prefill_supported
         default_prefix_caching = model_config.is_prefix_caching_supported

-        if self.prefill_context_parallel_size > 1:
-            default_chunked_prefill = False
-            default_prefix_caching = False
-            logger.warning_once(
-                "--prefill-context-parallel-size > 1 is not compatible with "
-                "chunked prefill and prefix caching now. Chunked prefill "
-                "and prefix caching have been disabled by default.",
-                scope="local",
-            )
-
         if self.enable_chunked_prefill is None:
             self.enable_chunked_prefill = default_chunked_prefill
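A hedged sketch of the effect of removing this block: with PCP enabled, chunked prefill and prefix caching keep their model-based defaults instead of being force-disabled. Field names come from the diff; the model name and the exact defaults printed are assumptions.

# Sketch only: build EngineArgs with PCP enabled and inspect the related defaults.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="facebook/opt-125m",         # placeholder model
    prefill_context_parallel_size=2,   # PCP > 1 no longer triggers overrides here
)
# After this change the PCP branch above is gone, so these stay at their
# unset defaults and are resolved later from the model's capabilities.
print(args.enable_chunked_prefill, args.enable_prefix_caching)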
vllm/v1/worker/cp_utils.py (new file, +42 lines)
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Any, cast
+
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+else:
+    AttentionLayerBase = object
+
+
+def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
+    pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
+    dcp_size = vllm_config.parallel_config.decode_context_parallel_size
+    interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size
+    if pcp_size * dcp_size > 1:
+        layer_type = cast(type[Any], AttentionLayerBase)
+        layers = get_layers_from_vllm_config(vllm_config, layer_type)
+        for layer in layers.values():
+            layer_impl = getattr(layer, "impl", None)
+            if layer_impl is None:
+                continue
+            if vllm_config.speculative_config is not None and interleave_size > 1:
+                assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
+                    "MTP with cp_kv_cache_interleave_size > 1 is not "
+                    f"supported in {layer_impl.__class__.__name__}."
+                )
+            if dcp_size > 1:
+                assert layer_impl.need_to_return_lse_for_decode, (
+                    "DCP requires attention impls to return"
+                    " the softmax lse for decode, but the impl "
+                    f"{layer_impl.__class__.__name__} "
+                    "does not return the softmax lse for decode."
+                )
+
+            if pcp_size > 1:
+                assert layer_impl.supports_pcp, (
+                    "PCP requires attention impls' support, "
+                    f"but the impl {layer_impl.__class__.__name__} "
+                    "does not support PCP."
+                )
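A small, hedged usage sketch of the new helper: the wrapper function below is illustrative and not part of the diff; it relies only on the fact that the checks above raise AssertionError when a layer impl lacks the required support.

# Illustrative wrapper around check_attention_cp_compatibility; every other
# name here is an assumption made for the example.
from vllm.v1.worker.cp_utils import check_attention_cp_compatibility


def validate_cp_setup(vllm_config) -> bool:
    """Return True if every attention impl satisfies the PCP/DCP requirements."""
    try:
        check_attention_cp_compatibility(vllm_config)
    except AssertionError as err:
        # A layer impl is missing supports_pcp, need_to_return_lse_for_decode,
        # or MTP-with-interleave support; surface the message to the caller.
        print(f"CP compatibility check failed: {err}")
        return False
    return True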
@@ -148,6 +148,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
+from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
 from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -4736,6 +4737,9 @@ class GPUModelRunner(
             attention_backend_list, kv_cache_config.kv_cache_groups
         )

+        # Check if attention backend supports PCP&DCP and related features.
+        check_attention_cp_compatibility(self.vllm_config)
+
         for i, attn_backend_map in enumerate(attention_backend_maps):
             self.attn_groups.append(create_attn_groups(attn_backend_map, i))
@@ -5394,20 +5398,6 @@ class GPUModelRunner(
             kv_transfer_group.register_kv_caches(kv_caches)
             kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)

-        if self.dcp_world_size > 1:
-            layer_type = cast(type[Any], AttentionLayerBase)
-            layers = get_layers_from_vllm_config(self.vllm_config, layer_type)
-            for layer in layers.values():
-                layer_impl = getattr(layer, "impl", None)
-                if layer_impl is None:
-                    continue
-                assert layer_impl.need_to_return_lse_for_decode, (
-                    "DCP requires attention impls to return"
-                    " the softmax lse for decode, but the impl "
-                    f"{layer_impl.__class__.__name__} "
-                    "does not return the softmax lse for decode."
-                )
-
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """
         Add encoder-only layers to the KV cache config.