diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 33d8168f8a13c..63ee8f50825c5 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -18,11 +18,6 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
 logger = init_logger(__name__)
 
-# Error string(s) for encoder/decoder
-# unsupported attention scenarios
-STR_NOT_IMPL_ENC_DEC_ROCM_HIP = ("ROCm/HIP is not currently supported "
-                                 "with encoder/decoder models.")
-
 PAD_SLOT_ID = -1
 
 # Switch to numpy implementation of compute_slot_mapping
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 834ec9b1d30b4..b74b746a35830 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -88,64 +88,6 @@ DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
 
-# Exception strings for non-implemented encoder/decoder scenarios
-
-# Reminder: Please update docs/features/compatibility_matrix.md
-# If the feature combo become valid
-
-STR_NOT_IMPL_ENC_DEC_SWA = \
-    "Sliding window attention for encoder/decoder models " + \
-    "is not currently supported."
-
-STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \
-    "Prefix caching for encoder/decoder models " + \
-    "is not currently supported."
-
-STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL = \
-    "Chunked prefill for encoder/decoder models " + \
-    "is not currently supported."
-
-STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = (
-    "Models with logits_soft_cap "
-    "require FlashInfer backend, which is "
-    "currently not supported for encoder/decoder "
-    "models.")
-
-STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently "
-                             "supported with encoder/decoder "
-                             "models.")
-
-STR_NOT_IMPL_ENC_DEC_PP = ("Pipeline parallelism is not "
-                           "currently supported with "
-                           "encoder/decoder models.")
-
-STR_NOT_IMPL_ENC_DEC_MM = ("Multimodal is not currently "
-                           "supported with encoder/decoder "
-                           "models.")
-
-STR_NOT_IMPL_ENC_DEC_SPEC_DEC = ("Speculative decoding is not "
-                                 "currently supported with encoder/"
-                                 "decoder models.")
-
-STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only "
-                                "backends currently supported with encoder/"
-                                "decoder models.")
-
-# Efficiently import all enc/dec error strings
-# rather than having to import all of the above
-STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
-    "STR_NOT_IMPL_ENC_DEC_SWA": STR_NOT_IMPL_ENC_DEC_SWA,
-    "STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE": STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
-    "STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL":
-    STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL,
-    "STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP": STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP,
-    "STR_NOT_IMPL_ENC_DEC_LORA": STR_NOT_IMPL_ENC_DEC_LORA,
-    "STR_NOT_IMPL_ENC_DEC_PP": STR_NOT_IMPL_ENC_DEC_PP,
-    "STR_NOT_IMPL_ENC_DEC_MM": STR_NOT_IMPL_ENC_DEC_MM,
-    "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC,
-    "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND,
-}
-
 # Constants related to forcing the attention backend selection
 
 # String name of register which may be set in order to
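
For context, the deleted block centralized every encoder/decoder "not implemented" message in the STR_NOT_IMPL_ENC_DEC_ERR_STRS dict, so a call site could import one name and index it by key rather than importing each STR_NOT_IMPL_ENC_DEC_* constant. Below is a minimal, self-contained sketch of that now-removed pattern; it is an assumption for illustration, not actual vLLM call-site code, and the reject_enc_dec_feature helper is hypothetical.

# Sketch of the removed lookup pattern (assumption, not vLLM's real call sites).
# Only two of the removed messages are reproduced here to keep the example short.
STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
    "STR_NOT_IMPL_ENC_DEC_SWA":
    "Sliding window attention for encoder/decoder models "
    "is not currently supported.",
    "STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE":
    "Prefix caching for encoder/decoder models "
    "is not currently supported.",
}


def reject_enc_dec_feature(feature_key: str) -> None:
    # Hypothetical helper: raise the canonical message for an unsupported
    # encoder/decoder feature combination looked up from the shared dict.
    raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[feature_key])


if __name__ == "__main__":
    try:
        reject_enc_dec_feature("STR_NOT_IMPL_ENC_DEC_SWA")
    except NotImplementedError as exc:
        # Prints: Sliding window attention for encoder/decoder models is not
        # currently supported.
        print(exc)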