diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py index 2e3c93c4ca24a..7625b1b5f8951 100644 --- a/vllm/v1/worker/cp_utils.py +++ b/vllm/v1/worker/cp_utils.py @@ -34,10 +34,13 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"current attention backend " f"'{layer_impl.__class__.__name__}'.\n\n" f"To resolve this issue, try one of the following:\n" - f" 1. Use a different attention backend by specifying:\n" - f" --attention-backend \n" - f" 2. Set cp_kv_cache_interleave_size to 1\n" - f" 3. Disable speculative decoding" + f" 1. Set cp_kv_cache_interleave_size to 1\n" + f" 2. Disable speculative decoding\n\n" + f"Note: No backends currently support MTP with " + f"cp_kv_cache_interleave_size > 1.\n\n" + f"For more information, see:\n" + f" https://docs.vllm.ai/en/latest/serving/" + f"distributed_serving.html" ) if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode: @@ -65,11 +68,10 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: f"backend that supports PCP. The current backend " f"'{layer_impl.__class__.__name__}' does not support this " f"feature.\n\n" - f"To resolve this issue, try one of the following:\n" - f" 1. Use a compatible attention backend by specifying:\n" - f" --attention-backend \n" - f" 2. Disable PCP by removing the " + f"To resolve this issue:\n" + f" Disable PCP by removing the " f"--prefill-context-parallel-size flag\n\n" + f"Note: No backends currently support PCP.\n\n" f"For more information, see:\n" f" https://docs.vllm.ai/en/latest/serving/" f"distributed_serving.html"