From 3c8358c328ab67692b88c6ff33c90229b1e295ed Mon Sep 17 00:00:00 2001 From: yurekami <249254018+yurekami@users.noreply.github.com> Date: Thu, 18 Dec 2025 15:58:13 +0900 Subject: [PATCH 1/3] [v1][CP] Improve DCP/PCP/MTP error messages with actionable guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace cryptic AssertionErrors with informative RuntimeErrors that: - Explain what DCP (Decode Context Parallel) and PCP (Prefill Context Parallel) are - List compatible attention backends - Provide environment variable instructions (VLLM_ATTENTION_BACKEND) - Include documentation links Fixes #28407 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 Signed-off-by: yurekami <249254018+yurekami@users.noreply.github.com> --- vllm/v1/worker/cp_utils.py | 66 +++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py index f666c739b0be7..2627187290ff3 100644 --- a/vllm/v1/worker/cp_utils.py +++ b/vllm/v1/worker/cp_utils.py @@ -21,22 +21,56 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: layer_impl = getattr(layer, "impl", None) if layer_impl is None: continue - if vllm_config.speculative_config is not None and interleave_size > 1: - assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, ( - "MTP with cp_kv_cache_interleave_size > 1 is not " - f"supported in {layer_impl.__class__.__name__}." - ) - if dcp_size > 1: - assert layer_impl.need_to_return_lse_for_decode, ( - "DCP requires attention impls to return" - " the softmax lse for decode, but the impl " - f"{layer_impl.__class__.__name__} " - "does not return the softmax lse for decode." + + supports_mtp = layer_impl.supports_mtp_with_cp_non_trivial_interleave_size + if ( + vllm_config.speculative_config is not None + and interleave_size > 1 + and not supports_mtp + ): + raise RuntimeError( + f"Multi-Token Prediction (MTP) with " + f"cp_kv_cache_interleave_size > 1 is not supported by the " + f"current attention backend " + f"'{layer_impl.__class__.__name__}'.\n\n" + f"To resolve this issue, try one of the following:\n" + f" 1. Use a different attention backend by setting:\n" + f" export VLLM_ATTENTION_BACKEND=\n" + f" 2. Set cp_kv_cache_interleave_size to 1\n" + f" 3. Disable speculative decoding" ) - if pcp_size > 1: - assert layer_impl.supports_pcp, ( - "PCP requires attention impls' support, " - f"but the impl {layer_impl.__class__.__name__} " - "does not support PCP." + if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode: + raise RuntimeError( + f"Decode Context Parallel (DCP) requires an attention " + f"backend that supports returning softmax LSE (log-sum-exp) " + f"for decode operations. The current backend " + f"'{layer_impl.__class__.__name__}' does not support this " + f"feature.\n\n" + f"To resolve this issue, try one of the following:\n" + f" 1. Use a compatible attention backend by setting:\n" + f" export VLLM_ATTENTION_BACKEND=\n" + f" Compatible backends: FLASH_ATTN, FLASHINFER, " + f"TRITON_MLA, FLASH_MLA, FLASH_ATTN_MLA, CUTLASS_MLA\n" + f" 2. Disable DCP by removing the " + f"--decode-context-parallel-size flag\n\n" + f"For more information, see:\n" + f" https://docs.vllm.ai/en/latest/serving/" + f"distributed_serving.html" + ) + + if pcp_size > 1 and not layer_impl.supports_pcp: + raise RuntimeError( + f"Prefill Context Parallel (PCP) requires an attention " + f"backend that supports PCP. The current backend " + f"'{layer_impl.__class__.__name__}' does not support this " + f"feature.\n\n" + f"To resolve this issue, try one of the following:\n" + f" 1. Use a compatible attention backend by setting:\n" + f" export VLLM_ATTENTION_BACKEND=\n" + f" 2. Disable PCP by removing the " + f"--prefill-context-parallel-size flag\n\n" + f"For more information, see:\n" + f" https://docs.vllm.ai/en/latest/serving/" + f"distributed_serving.html" ) From 79e0db60ee15933fc5afa9486a57489c894ed20f Mon Sep 17 00:00:00 2001 From: yurekami Date: Wed, 24 Dec 2025 15:16:10 +0900 Subject: [PATCH 2/3] Use --attention-backend flag instead of VLLM_ATTENTION_BACKEND env var MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per reviewer feedback, the VLLM_ATTENTION_BACKEND environment variable is being deprecated in favor of the --attention-backend CLI flag. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 Signed-off-by: yurekami --- vllm/v1/worker/cp_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py index 2627187290ff3..2e3c93c4ca24a 100644 --- a/vllm/v1/worker/cp_utils.py +++ b/vllm/v1/worker/cp_utils.py @@ -34,8 +34,8 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"current attention backend " f"'{layer_impl.__class__.__name__}'.\n\n" f"To resolve this issue, try one of the following:\n" - f" 1. Use a different attention backend by setting:\n" - f" export VLLM_ATTENTION_BACKEND=\n" + f" 1. Use a different attention backend by specifying:\n" + f" --attention-backend \n" f" 2. Set cp_kv_cache_interleave_size to 1\n" f" 3. Disable speculative decoding" ) @@ -48,8 +48,8 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"'{layer_impl.__class__.__name__}' does not support this " f"feature.\n\n" f"To resolve this issue, try one of the following:\n" - f" 1. Use a compatible attention backend by setting:\n" - f" export VLLM_ATTENTION_BACKEND=\n" + f" 1. Use a compatible attention backend by specifying:\n" + f" --attention-backend \n" f" Compatible backends: FLASH_ATTN, FLASHINFER, " f"TRITON_MLA, FLASH_MLA, FLASH_ATTN_MLA, CUTLASS_MLA\n" f" 2. Disable DCP by removing the " @@ -66,8 +66,8 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"'{layer_impl.__class__.__name__}' does not support this " f"feature.\n\n" f"To resolve this issue, try one of the following:\n" - f" 1. Use a compatible attention backend by setting:\n" - f" export VLLM_ATTENTION_BACKEND=\n" + f" 1. Use a compatible attention backend by specifying:\n" + f" --attention-backend \n" f" 2. Disable PCP by removing the " f"--prefill-context-parallel-size flag\n\n" f"For more information, see:\n" From 4b7df5710a6f1001b0c85ecbef85af6d10e45ca5 Mon Sep 17 00:00:00 2001 From: yurekami Date: Thu, 25 Dec 2025 04:08:54 +0900 Subject: [PATCH 3/3] Add compatibility notes and docs links to MTP/PCP error messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add documentation links to MTP and PCP error messages for consistency with DCP error message - Add notes indicating no backends currently support these features - Remove suggestion to use --attention-backend for PCP since no backends support it yet 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 Signed-off-by: yurekami --- vllm/v1/worker/cp_utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py index 2e3c93c4ca24a..7625b1b5f8951 100644 --- a/vllm/v1/worker/cp_utils.py +++ b/vllm/v1/worker/cp_utils.py @@ -34,10 +34,13 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"current attention backend " f"'{layer_impl.__class__.__name__}'.\n\n" f"To resolve this issue, try one of the following:\n" - f" 1. Use a different attention backend by specifying:\n" - f" --attention-backend \n" - f" 2. Set cp_kv_cache_interleave_size to 1\n" - f" 3. Disable speculative decoding" + f" 1. Set cp_kv_cache_interleave_size to 1\n" + f" 2. Disable speculative decoding\n\n" + f"Note: No backends currently support MTP with " + f"cp_kv_cache_interleave_size > 1.\n\n" + f"For more information, see:\n" + f" https://docs.vllm.ai/en/latest/serving/" + f"distributed_serving.html" ) if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode: @@ -65,11 +68,10 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"backend that supports PCP. The current backend " f"'{layer_impl.__class__.__name__}' does not support this " f"feature.\n\n" - f"To resolve this issue, try one of the following:\n" - f" 1. Use a compatible attention backend by specifying:\n" - f" --attention-backend \n" - f" 2. Disable PCP by removing the " + f"To resolve this issue:\n" + f" Disable PCP by removing the " f"--prefill-context-parallel-size flag\n\n" + f"Note: No backends currently support PCP.\n\n" f"For more information, see:\n" f" https://docs.vllm.ai/en/latest/serving/" f"distributed_serving.html"