[Misc][PCP&DCP] relocate PCP feature check (#30050)
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent 853611bb18
commit a11f4a81e0
@@ -294,6 +294,12 @@ class AttentionImpl(ABC, Generic[T]):
     # Some features like decode context parallelism require the softmax lse.
     can_return_lse_for_decode: bool = False

+    # Whether the attention impl supports Prefill Context Parallelism.
+    supports_pcp: bool = False
+    # Whether the attention impl(or ops) supports MTP
+    # when cp_kv_cache_interleave_size > 1
+    supports_mtp_with_cp_non_trivial_interleave_size: bool = False
+
     # some attention backends might not always want to return lse
     # even if they can return lse (for efficiency reasons)
     need_to_return_lse_for_decode: bool = False
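As context for the new capability flags above, here is a minimal, hypothetical sketch of how a backend implementation might opt in to them. The class name, flag values, and import path below are illustrative assumptions, not part of this diff.

# Hypothetical backend opting in to the new capability flags.
# The import path is an assumption; adjust it to wherever AttentionImpl
# is defined in your checkout.
from vllm.attention.backends.abstract import AttentionImpl


class MyBackendImpl(AttentionImpl):
    # This backend can compute and return the softmax lse for decode (needed by DCP).
    can_return_lse_for_decode = True
    need_to_return_lse_for_decode = True
    # This backend has been validated with Prefill Context Parallelism.
    supports_pcp = True
    # MTP with cp_kv_cache_interleave_size > 1 has not been verified here.
    supports_mtp_with_cp_non_trivial_interleave_size = False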
@@ -317,11 +317,6 @@ class ParallelConfig:
                 "num_redundant_experts."
             )

-        if self.prefill_context_parallel_size > 1:
-            raise ValueError(
-                "Prefill context parallelism is not fully supported. "
-                "Please set prefill_context_parallel_size to 1."
-            )
         return self

     @property
@@ -820,11 +820,6 @@ class VllmConfig:
                 f"({self.parallel_config.cp_kv_cache_interleave_size})."
             )

-        assert (
-            self.parallel_config.cp_kv_cache_interleave_size == 1
-            or self.speculative_config is None
-        ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
-
         # Do this after all the updates to compilation_config.mode
         self.compilation_config.set_splitting_ops_for_v1(
             all2all_backend=self.parallel_config.all2all_backend,
@@ -1848,16 +1848,6 @@ class EngineArgs:
         default_chunked_prefill = model_config.is_chunked_prefill_supported
         default_prefix_caching = model_config.is_prefix_caching_supported

-        if self.prefill_context_parallel_size > 1:
-            default_chunked_prefill = False
-            default_prefix_caching = False
-            logger.warning_once(
-                "--prefill-context-parallel-size > 1 is not compatible with "
-                "chunked prefill and prefix caching now. Chunked prefill "
-                "and prefix caching have been disabled by default.",
-                scope="local",
-            )
-
         if self.enable_chunked_prefill is None:
             self.enable_chunked_prefill = default_chunked_prefill
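A hedged sketch of the effect of removing this block: with PCP enabled, chunked prefill and prefix caching keep their model-based defaults instead of being force-disabled. Field names come from the diff; the model name and the exact defaults printed are assumptions.

# Sketch only: build EngineArgs with PCP enabled and inspect the related defaults.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="facebook/opt-125m",         # placeholder model
    prefill_context_parallel_size=2,   # PCP > 1 no longer triggers overrides here
)
# After this change the PCP branch above is gone, so these stay at their
# unset defaults and are resolved later from the model's capabilities.
print(args.enable_chunked_prefill, args.enable_prefix_caching)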
vllm/v1/worker/cp_utils.py (new file, +42 lines)
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Any, cast
+
+from vllm.config import VllmConfig, get_layers_from_vllm_config
+
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+else:
+    AttentionLayerBase = object
+
+
+def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
+    pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
+    dcp_size = vllm_config.parallel_config.decode_context_parallel_size
+    interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size
+    if pcp_size * dcp_size > 1:
+        layer_type = cast(type[Any], AttentionLayerBase)
+        layers = get_layers_from_vllm_config(vllm_config, layer_type)
+        for layer in layers.values():
+            layer_impl = getattr(layer, "impl", None)
+            if layer_impl is None:
+                continue
+            if vllm_config.speculative_config is not None and interleave_size > 1:
+                assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
+                    "MTP with cp_kv_cache_interleave_size > 1 is not "
+                    f"supported in {layer_impl.__class__.__name__}."
+                )
+            if dcp_size > 1:
+                assert layer_impl.need_to_return_lse_for_decode, (
+                    "DCP requires attention impls to return"
+                    " the softmax lse for decode, but the impl "
+                    f"{layer_impl.__class__.__name__} "
+                    "does not return the softmax lse for decode."
+                )
+
+            if pcp_size > 1:
+                assert layer_impl.supports_pcp, (
+                    "PCP requires attention impls' support, "
+                    f"but the impl {layer_impl.__class__.__name__} "
+                    "does not support PCP."
+                )
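A small, hedged usage sketch of the new helper: the wrapper function below is illustrative and not part of the diff; it relies only on the fact that the checks above raise AssertionError when a layer impl lacks the required support.

# Illustrative wrapper around check_attention_cp_compatibility; every other
# name here is an assumption made for the example.
from vllm.v1.worker.cp_utils import check_attention_cp_compatibility


def validate_cp_setup(vllm_config) -> bool:
    """Return True if every attention impl satisfies the PCP/DCP requirements."""
    try:
        check_attention_cp_compatibility(vllm_config)
    except AssertionError as err:
        # A layer impl is missing supports_pcp, need_to_return_lse_for_decode,
        # or MTP-with-interleave support; surface the message to the caller.
        print(f"CP compatibility check failed: {err}")
        return False
    return True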
@@ -148,6 +148,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
+from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
 from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@@ -4736,6 +4737,9 @@ class GPUModelRunner(
             attention_backend_list, kv_cache_config.kv_cache_groups
         )

+        # Check if attention backend supports PCP&DCP and related features.
+        check_attention_cp_compatibility(self.vllm_config)
+
         for i, attn_backend_map in enumerate(attention_backend_maps):
             self.attn_groups.append(create_attn_groups(attn_backend_map, i))
@@ -5394,20 +5398,6 @@ class GPUModelRunner(
             kv_transfer_group.register_kv_caches(kv_caches)
             kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)

-        if self.dcp_world_size > 1:
-            layer_type = cast(type[Any], AttentionLayerBase)
-            layers = get_layers_from_vllm_config(self.vllm_config, layer_type)
-            for layer in layers.values():
-                layer_impl = getattr(layer, "impl", None)
-                if layer_impl is None:
-                    continue
-                assert layer_impl.need_to_return_lse_for_decode, (
-                    "DCP requires attention impls to return"
-                    " the softmax lse for decode, but the impl "
-                    f"{layer_impl.__class__.__name__} "
-                    "does not return the softmax lse for decode."
-                )
-
     def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
         """
         Add encoder-only layers to the KV cache config.