mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-21 14:45:45 +08:00
[UX] Raise error for attn backend of batch invariant (#29348)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
c17610e2ba
commit
699bca76c0
@@ -812,19 +812,19 @@ def override_envs_for_invariance():
|
|||||||
# "TRITON_MLA",
|
# "TRITON_MLA",
|
||||||
]
|
]
|
||||||
if curr_attn_backend not in supported_backends:
|
if curr_attn_backend not in supported_backends:
|
||||||
warning = (
|
error = (
|
||||||
"Forcibly updating attention backend to"
|
"VLLM batch_invariant mode requires an attention backend in "
|
||||||
f" {supported_backends[0]} for batch_invariant. "
|
f"{supported_backends}, but got '{curr_attn_backend}'. "
|
||||||
f" Supported backends: {supported_backends}."
|
"Please set the 'VLLM_ATTENTION_BACKEND' environment variable "
|
||||||
|
"to one of the supported backends before enabling batch_invariant."
|
||||||
)
|
)
|
||||||
logger.warning_once(warning)
|
raise RuntimeError(error)
|
||||||
os.environ["VLLM_ATTENTION_BACKEND"] = supported_backends[0]
|
|
||||||
if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]:
|
if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]:
|
||||||
warning = (
|
warning = (
|
||||||
"You are using a decode-invariant form of batch invariance. "
|
"You are using a decode-invariant form of batch invariance. "
|
||||||
"This will not be invariant between prefill and decode."
|
"This will not be invariant between prefill and decode."
|
||||||
)
|
)
|
||||||
logger.warning_once(warning)
|
logger.warning_once(warning, scope="local")
|
||||||
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
|
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
|
||||||
|
|
||||||
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user