From 3c8358c328ab67692b88c6ff33c90229b1e295ed Mon Sep 17 00:00:00 2001
From: yurekami <249254018+yurekami@users.noreply.github.com>
Date: Thu, 18 Dec 2025 15:58:13 +0900
Subject: [PATCH 1/3] [v1][CP] Improve DCP/PCP/MTP error messages with
 actionable guidance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace cryptic AssertionErrors with informative RuntimeErrors that:
- Spell out what MTP (Multi-Token Prediction), DCP (Decode Context
  Parallel) and PCP (Prefill Context Parallel) stand for
- List compatible attention backends (DCP message)
- Provide environment variable instructions (VLLM_ATTENTION_BACKEND)
- Include documentation links (DCP and PCP messages)

Fixes #28407

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: yurekami <249254018+yurekami@users.noreply.github.com>
---
 vllm/v1/worker/cp_utils.py | 66 +++++++++++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 16 deletions(-)
diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py
index f666c739b0be7..2627187290ff3 100644
--- a/vllm/v1/worker/cp_utils.py
+++ b/vllm/v1/worker/cp_utils.py
@@ -21,22 +21,56 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
             layer_impl = getattr(layer, "impl", None)
             if layer_impl is None:
                 continue
-            if vllm_config.speculative_config is not None and interleave_size > 1:
-                assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
-                    "MTP with cp_kv_cache_interleave_size > 1 is not "
-                    f"supported in {layer_impl.__class__.__name__}."
-                )
-            if dcp_size > 1:
-                assert layer_impl.need_to_return_lse_for_decode, (
-                    "DCP requires attention impls to return"
-                    " the softmax lse for decode, but the impl "
-                    f"{layer_impl.__class__.__name__} "
-                    "does not return the softmax lse for decode."
+
+            supports_mtp = layer_impl.supports_mtp_with_cp_non_trivial_interleave_size
+            if (
+                vllm_config.speculative_config is not None
+                and interleave_size > 1
+                and not supports_mtp
+            ):
+                raise RuntimeError(
+                    f"Multi-Token Prediction (MTP) with "
+                    f"cp_kv_cache_interleave_size > 1 is not supported by the "
+                    f"current attention backend "
+                    f"'{layer_impl.__class__.__name__}'.\n\n"
+                    f"To resolve this issue, try one of the following:\n"
+                    f"  1. Use a different attention backend by setting:\n"
+                    f"     export VLLM_ATTENTION_BACKEND=<backend>\n"
+                    f"  2. Set cp_kv_cache_interleave_size to 1\n"
+                    f"  3. Disable speculative decoding"
                 )
 
-            if pcp_size > 1:
-                assert layer_impl.supports_pcp, (
-                    "PCP requires attention impls' support, "
-                    f"but the impl {layer_impl.__class__.__name__} "
-                    "does not support PCP."
+            if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode:
+                raise RuntimeError(
+                    f"Decode Context Parallel (DCP) requires an attention "
+                    f"backend that supports returning softmax LSE (log-sum-exp) "
+                    f"for decode operations. The current backend "
+                    f"'{layer_impl.__class__.__name__}' does not support this "
+                    f"feature.\n\n"
+                    f"To resolve this issue, try one of the following:\n"
+                    f"  1. Use a compatible attention backend by setting:\n"
+                    f"     export VLLM_ATTENTION_BACKEND=<backend>\n"
+                    f"     Compatible backends: FLASH_ATTN, FLASHINFER, "
+                    f"TRITON_MLA, FLASH_MLA, FLASH_ATTN_MLA, CUTLASS_MLA\n"
+                    f"  2. Disable DCP by removing the "
+                    f"--decode-context-parallel-size flag\n\n"
+                    f"For more information, see:\n"
+                    f"  https://docs.vllm.ai/en/latest/serving/"
+                    f"distributed_serving.html"
+                )
+
+            if pcp_size > 1 and not layer_impl.supports_pcp:
+                raise RuntimeError(
+                    f"Prefill Context Parallel (PCP) requires an attention "
+                    f"backend that supports PCP. The current backend "
+                    f"'{layer_impl.__class__.__name__}' does not support this "
+                    f"feature.\n\n"
+                    f"To resolve this issue, try one of the following:\n"
+                    f"  1. Use a compatible attention backend by setting:\n"
+                    f"     export VLLM_ATTENTION_BACKEND=<backend>\n"
+                    f"  2. Disable PCP by removing the "
+                    f"--prefill-context-parallel-size flag\n\n"
+                    f"For more information, see:\n"
+                    f"  https://docs.vllm.ai/en/latest/serving/"
+                    f"distributed_serving.html"
                 )

From 79e0db60ee15933fc5afa9486a57489c894ed20f Mon Sep 17 00:00:00 2001
From: yurekami <yurekami@users.noreply.github.com>
Date: Wed, 24 Dec 2025 15:16:10 +0900
Subject: [PATCH 2/3] Use --attention-backend flag instead of
 VLLM_ATTENTION_BACKEND env var
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per reviewer feedback, point users at the --attention-backend CLI flag
in the error messages instead of the VLLM_ATTENTION_BACKEND environment
variable, which is being deprecated.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: yurekami <yurekami@users.noreply.github.com>
---
 vllm/v1/worker/cp_utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py
index 2627187290ff3..2e3c93c4ca24a 100644
--- a/vllm/v1/worker/cp_utils.py
+++ b/vllm/v1/worker/cp_utils.py
@@ -34,8 +34,8 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
                     f"current attention backend "
                     f"'{layer_impl.__class__.__name__}'.\n\n"
                     f"To resolve this issue, try one of the following:\n"
-                    f"  1. Use a different attention backend by setting:\n"
-                    f"     export VLLM_ATTENTION_BACKEND=<backend>\n"
+                    f"  1. Use a different attention backend by specifying:\n"
+                    f"     --attention-backend <backend>\n"
                     f"  2. Set cp_kv_cache_interleave_size to 1\n"
                     f"  3. Disable speculative decoding"
                 )
@@ -48,8 +48,8 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
                     f"'{layer_impl.__class__.__name__}' does not support this "
                     f"feature.\n\n"
                     f"To resolve this issue, try one of the following:\n"
-                    f"  1. Use a compatible attention backend by setting:\n"
-                    f"     export VLLM_ATTENTION_BACKEND=<backend>\n"
+                    f"  1. Use a compatible attention backend by specifying:\n"
+                    f"     --attention-backend <backend>\n"
                     f"     Compatible backends: FLASH_ATTN, FLASHINFER, "
                     f"TRITON_MLA, FLASH_MLA, FLASH_ATTN_MLA, CUTLASS_MLA\n"
                     f"  2. Disable DCP by removing the "
@@ -66,8 +66,8 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
                     f"'{layer_impl.__class__.__name__}' does not support this "
                     f"feature.\n\n"
                     f"To resolve this issue, try one of the following:\n"
-                    f"  1. Use a compatible attention backend by setting:\n"
-                    f"     export VLLM_ATTENTION_BACKEND=<backend>\n"
+                    f"  1. Use a compatible attention backend by specifying:\n"
+                    f"     --attention-backend <backend>\n"
                     f"  2. Disable PCP by removing the "
                     f"--prefill-context-parallel-size flag\n\n"
                     f"For more information, see:\n"

From 4b7df5710a6f1001b0c85ecbef85af6d10e45ca5 Mon Sep 17 00:00:00 2001
From: yurekami <yurekami@users.noreply.github.com>
Date: Thu, 25 Dec 2025 04:08:54 +0900
Subject: [PATCH 3/3] Add compatibility notes and docs links to MTP/PCP error
 messages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add a documentation link to the MTP error message for consistency
  with the DCP and PCP error messages
- Add notes indicating that no backends currently support MTP with
  cp_kv_cache_interleave_size > 1, or PCP
- Remove the suggestion to use --attention-backend from the MTP and PCP
  messages since no backends support these features yet

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: yurekami <yurekami@users.noreply.github.com>
---
 vllm/v1/worker/cp_utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py
index 2e3c93c4ca24a..7625b1b5f8951 100644
--- a/vllm/v1/worker/cp_utils.py
+++ b/vllm/v1/worker/cp_utils.py
@@ -34,10 +34,13 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
                     f"current attention backend "
                     f"'{layer_impl.__class__.__name__}'.\n\n"
                     f"To resolve this issue, try one of the following:\n"
-                    f"  1. Use a different attention backend by specifying:\n"
-                    f"     --attention-backend <backend>\n"
-                    f"  2. Set cp_kv_cache_interleave_size to 1\n"
-                    f"  3. Disable speculative decoding"
+                    f"  1. Set cp_kv_cache_interleave_size to 1\n"
+                    f"  2. Disable speculative decoding\n\n"
+                    f"Note: No backends currently support MTP with "
+                    f"cp_kv_cache_interleave_size > 1.\n\n"
+                    f"For more information, see:\n"
+                    f"  https://docs.vllm.ai/en/latest/serving/"
+                    f"distributed_serving.html"
                 )
 
             if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode:
@@ -65,11 +68,10 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
                     f"backend that supports PCP. The current backend "
                     f"'{layer_impl.__class__.__name__}' does not support this "
                     f"feature.\n\n"
-                    f"To resolve this issue, try one of the following:\n"
-                    f"  1. Use a compatible attention backend by specifying:\n"
-                    f"     --attention-backend <backend>\n"
-                    f"  2. Disable PCP by removing the "
+                    f"To resolve this issue:\n"
+                    f"  Disable PCP by removing the "
                     f"--prefill-context-parallel-size flag\n\n"
+                    f"Note: No backends currently support PCP.\n\n"
                     f"For more information, see:\n"
                     f"  https://docs.vllm.ai/en/latest/serving/"
                     f"distributed_serving.html"