Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-09 13:15:48 +08:00)
[BugFix] Graceful handling of torch symm mem errors. (#27671)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
parent: d23539549a
commit: 1788aa1efb
@@ -88,13 +88,21 @@ class SymmMemCommunicator:
|
||||
self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
|
||||
self.world_size
|
||||
]
|
||||
|
||||
-self.buffer = torch_symm_mem.empty(
-    self.max_size // self.dtype.itemsize,
-    device=self.device,
-    dtype=self.dtype,
-)
-handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+try:
+    self.buffer = torch_symm_mem.empty(
+        self.max_size // self.dtype.itemsize,
+        device=self.device,
+        dtype=self.dtype,
+    )
+    handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+except RuntimeError as e:
+    logger.warning_once(
+        "SymmMemCommunicator: symmetric memory initialization failed: %s "
+        "Communicator is not available. To suppress this warning set "
+        "VLLM_ALLREDUCE_USE_SYMM_MEM=0",
+        str(e),
+    )
+    return
|
||||
if handle.multicast_ptr == 0:
|
||||
logger.warning(
|
||||
"SymmMemCommunicator: symmetric memory "
|
||||
|
||||
@@ -201,7 +201,7 @@ if TYPE_CHECKING:
|
||||
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
|
||||
VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False
|
||||
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
|
||||
-VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
+VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
|
||||
VLLM_TUNED_CONFIG_FOLDER: str | None = None
|
||||
VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set()
|
||||
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
|
||||
@@ -1389,7 +1389,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
),
|
||||
# Whether to use pytorch symmetric memory for allreduce
|
||||
"VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool(
|
||||
-int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "0"))
+int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1"))
|
||||
),
|
||||
# Allows vllm to find tuned config under customized folder
|
||||
"VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user