diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 484f3986bb301..f53e8b0308853 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3021,16 +3021,20 @@ def _get_and_verify_max_len(
             f"User-specified max_model_len ({max_model_len}) is greater "
             f"than the derived max_model_len ({max_len_key}="
             f"{derived_max_model_len} or model_max_length="
-            f"{model_max_length} in model's config.json). This may lead "
-            "to incorrect model outputs or CUDA errors.")
+            f"{model_max_length} in model's config.json).")
+        warning = (
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
+            "caution. If the model uses relative position encoding (RoPE), "
+            "positions exceeding derived_max_model_len lead to nan. If the "
+            "model uses absolute position encoding, positions exceeding "
+            "derived_max_model_len will cause a CUDA array out-of-bounds "
+            "error.")
         if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
-            logger.warning(
-                "%s Make sure the value is correct and within the "
-                "model context size.", msg)
+            logger.warning_once("%s %s", msg, warning)
         else:
             raise ValueError(
                 f"{msg} To allow overriding this maximum, set "
                 f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
     return int(max_model_len)
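
For context on the code path this diff touches, here is a minimal usage sketch of the override (the model name and length values are illustrative assumptions, not taken from the diff):

```python
import os

# Opt in to a max_model_len larger than what vLLM derives from the model's
# config.json. Per the warning added in the diff above, positions past the
# derived limit can yield NaNs (RoPE models) or CUDA out-of-bounds errors
# (absolute position encodings), so this is strictly use-at-your-own-risk.
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

from vllm import LLM

# Illustrative values: a model whose derived max length is smaller than 4096
# would normally make _get_and_verify_max_len raise a ValueError; with the
# env var set, it now emits logger.warning_once instead.
llm = LLM(model="facebook/opt-125m", max_model_len=4096)
```

Note also the switch from logger.warning to logger.warning_once, which suppresses repeated emissions of the (now considerably longer) message when the check is hit more than once.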