[Frontend] Update the warning log when using VLLM_ALLOW_LONG_MAX_MODEL_LEN (#20904)

Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Authored by wang.yuqi on 2025-09-01 16:50:25 +08:00; committed by GitHub
parent d7fbc6ddac
commit 55602bb2e6

vllm/config.py

@@ -3021,16 +3021,20 @@ def _get_and_verify_max_len(
         else:
             msg = (
                 f"User-specified max_model_len ({max_model_len}) is greater "
                 f"than the derived max_model_len ({max_len_key}="
                 f"{derived_max_model_len} or model_max_length="
-                f"{model_max_length} in model's config.json). This may lead "
-                "to incorrect model outputs or CUDA errors.")
+                f"{model_max_length} in model's config.json).")
+            warning = (
+                "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
+                "caution. If the model uses relative position encoding (RoPE), "
+                "positions exceeding derived_max_model_len lead to nan. If the "
+                "model uses absolute position encoding, positions exceeding "
+                "derived_max_model_len will cause a CUDA array out-of-bounds "
+                "error.")
             if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
-                logger.warning(
-                    "%s Make sure the value is correct and within the "
-                    "model context size.", msg)
+                logger.warning_once("%s %s", msg, warning)
             else:
                 raise ValueError(
                     f"{msg} To allow overriding this maximum, set "
-                    "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
+                    f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
     return int(max_model_len)
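
For illustration, a minimal, self-contained sketch of the control flow this commit introduces. This is standalone Python, not vLLM's actual module: the check_max_model_len name and the literal length values are hypothetical, the env var is read via os.environ instead of vLLM's envs module, and plain logger.warning stands in for vLLM's logger.warning_once. Splitting msg from warning lets the soft path (log) and the hard path (raise) share the same explanatory text, which is the point of the change.

import logging
import os

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("max_len_sketch")

def check_max_model_len(max_model_len: int, derived_max_model_len: int) -> int:
    # Mirrors the patched branch: the base message no longer describes the
    # failure modes; those moved into a separate, reusable warning string.
    if max_model_len > derived_max_model_len:
        msg = (f"User-specified max_model_len ({max_model_len}) is greater "
               f"than the derived max_model_len ({derived_max_model_len}).")
        warning = (
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
            "caution. If the model uses relative position encoding (RoPE), "
            "positions exceeding derived_max_model_len lead to nan. If the "
            "model uses absolute position encoding, positions exceeding "
            "derived_max_model_len will cause a CUDA array out-of-bounds "
            "error.")
        if os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN") == "1":
            # The override path now logs the detailed warning instead of the
            # old generic "make sure the value is correct" note.
            logger.warning("%s %s", msg, warning)
        else:
            # The error path appends the same detail to the ValueError.
            raise ValueError(
                f"{msg} To allow overriding this maximum, set "
                f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
    return int(max_model_len)

# With the override set, the call logs the combined message and returns;
# without it, the same text arrives as a ValueError.
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
check_max_model_len(8192, 4096)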