Merge 4b7df5710a6f1001b0c85ecbef85af6d10e45ca5 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29

ゆり 2025-12-25 00:07:18 +00:00 committed by GitHub
commit 5772e041c2

@@ -21,22 +21,58 @@ def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
         layer_impl = getattr(layer, "impl", None)
         if layer_impl is None:
             continue
-        if vllm_config.speculative_config is not None and interleave_size > 1:
-            assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
-                "MTP with cp_kv_cache_interleave_size > 1 is not "
-                f"supported in {layer_impl.__class__.__name__}."
-            )
-        if dcp_size > 1:
-            assert layer_impl.need_to_return_lse_for_decode, (
-                "DCP requires attention impls to return"
-                " the softmax lse for decode, but the impl "
-                f"{layer_impl.__class__.__name__} "
-                "does not return the softmax lse for decode."
+        supports_mtp = layer_impl.supports_mtp_with_cp_non_trivial_interleave_size
+        if (
+            vllm_config.speculative_config is not None
+            and interleave_size > 1
+            and not supports_mtp
+        ):
+            raise RuntimeError(
+                f"Multi-Token Prediction (MTP) with "
+                f"cp_kv_cache_interleave_size > 1 is not supported by the "
+                f"current attention backend "
+                f"'{layer_impl.__class__.__name__}'.\n\n"
+                f"To resolve this issue, try one of the following:\n"
+                f" 1. Set cp_kv_cache_interleave_size to 1\n"
+                f" 2. Disable speculative decoding\n\n"
+                f"Note: No backends currently support MTP with "
+                f"cp_kv_cache_interleave_size > 1.\n\n"
+                f"For more information, see:\n"
+                f" https://docs.vllm.ai/en/latest/serving/"
+                f"distributed_serving.html"
             )
-        if pcp_size > 1:
-            assert layer_impl.supports_pcp, (
-                "PCP requires attention impls' support, "
-                f"but the impl {layer_impl.__class__.__name__} "
-                "does not support PCP."
+        if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode:
+            raise RuntimeError(
+                f"Decode Context Parallel (DCP) requires an attention "
+                f"backend that supports returning softmax LSE (log-sum-exp) "
+                f"for decode operations. The current backend "
+                f"'{layer_impl.__class__.__name__}' does not support this "
+                f"feature.\n\n"
+                f"To resolve this issue, try one of the following:\n"
+                f" 1. Use a compatible attention backend by specifying:\n"
+                f"    --attention-backend <backend>\n"
+                f"    Compatible backends: FLASH_ATTN, FLASHINFER, "
+                f"TRITON_MLA, FLASH_MLA, FLASH_ATTN_MLA, CUTLASS_MLA\n"
+                f" 2. Disable DCP by removing the "
+                f"--decode-context-parallel-size flag\n\n"
+                f"For more information, see:\n"
+                f" https://docs.vllm.ai/en/latest/serving/"
+                f"distributed_serving.html"
             )
+        if pcp_size > 1 and not layer_impl.supports_pcp:
+            raise RuntimeError(
+                f"Prefill Context Parallel (PCP) requires an attention "
+                f"backend that supports PCP. The current backend "
+                f"'{layer_impl.__class__.__name__}' does not support this "
+                f"feature.\n\n"
+                f"To resolve this issue:\n"
+                f" Disable PCP by removing the "
+                f"--prefill-context-parallel-size flag\n\n"
+                f"Note: No backends currently support PCP.\n\n"
+                f"For more information, see:\n"
+                f" https://docs.vllm.ai/en/latest/serving/"
+                f"distributed_serving.html"
+            )
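
A note on the pattern this diff adopts: Python strips assert statements when run with the -O flag, so compatibility checks written as asserts can silently vanish in optimized deployments, whereas an explicit RuntimeError always fires and its message can carry remediation steps. The sketch below reproduces the shape of the new DCP check outside of vLLM; DummyImpl and check_dcp_compat are hypothetical stand-ins, not vLLM APIs.

class DummyImpl:
    """Hypothetical stand-in for an attention backend impl.

    It exposes the same capability flag the real check inspects.
    """

    need_to_return_lse_for_decode = False


def check_dcp_compat(layer_impl, dcp_size: int) -> None:
    # Same shape as the new code above: an explicit condition plus
    # RuntimeError, rather than an assert that -O would remove.
    if dcp_size > 1 and not layer_impl.need_to_return_lse_for_decode:
        raise RuntimeError(
            f"Decode Context Parallel (DCP) requires an attention backend "
            f"that returns the softmax LSE for decode; "
            f"'{layer_impl.__class__.__name__}' does not."
        )


try:
    check_dcp_compat(DummyImpl(), dcp_size=2)
except RuntimeError as err:
    print(err)  # fires even under python -O, unlike the old assert

Beyond surviving -O, raising RuntimeError lets the message enumerate concrete fixes (switch the backend, drop the flag), which a bare AssertionError traceback does not surface as clearly.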