mm: guard against double pin and unpin explicitly (#10672)

As commented, if you let CUDA be the one to detect double pinning/unpinning,
it actually creates an async GPU error.
This commit is contained in:
rattus 2025-11-07 12:20:48 +10:00 committed by GitHub
parent eb1c42f649
commit cf97b033ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1103,6 +1103,12 @@ def pin_memory(tensor):
if not is_device_cpu(tensor.device):
return False
if tensor.is_pinned():
#NOTE: CUDA does detect when a tensor is already pinned and would
#error below, but there are proven cases where this also queues an error
#on the GPU async. So don't trust the CUDA API and guard here
return False
size = tensor.numel() * tensor.element_size()
if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY:
return False
@ -1123,6 +1129,12 @@ def unpin_memory(tensor):
if not is_device_cpu(tensor.device):
return False
if not tensor.is_pinned():
#NOTE: CUDA does detect when a tensor is not pinned (e.g. already unpinned)
#and would error below, but there are proven cases where this also queues an
#error on the GPU async. So don't trust the CUDA API and guard here
return False
ptr = tensor.data_ptr()
if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)