mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-24 15:27:06 +08:00
update lock
Signed-off-by: inkcherry <mingzhi.liu@amd.com>
This commit is contained in:
parent
374cc25e0f
commit
77321502e7
@ -520,7 +520,8 @@ class MoRIIOWriter:
|
|||||||
task.request_id, task.remote_ip, remote_port
|
task.request_id, task.remote_ip, remote_port
|
||||||
)
|
)
|
||||||
# mark request as done, then we can free the blocks
|
# mark request as done, then we can free the blocks
|
||||||
self.worker.moriio_wrapper.done_req_ids.append(task.request_id)
|
with self.worker.moriio_wrapper.lock:
|
||||||
|
self.worker.moriio_wrapper.done_req_ids.append(task.request_id)
|
||||||
del self.worker.moriio_wrapper.done_remote_allocate_req_dict[
|
del self.worker.moriio_wrapper.done_remote_allocate_req_dict[
|
||||||
task.request_id
|
task.request_id
|
||||||
]
|
]
|
||||||
@ -1559,7 +1560,6 @@ class MoRIIOConnectorWorker:
|
|||||||
|
|
||||||
retry_count = 0
|
retry_count = 0
|
||||||
index = 1
|
index = 1
|
||||||
should_break = True
|
|
||||||
with zmq_context.socket(zmq.DEALER) as sock:
|
with zmq_context.socket(zmq.DEALER) as sock:
|
||||||
sock.connect(f"tcp://{self.proxy_ip}:{self.proxy_ping_port}")
|
sock.connect(f"tcp://{self.proxy_ip}:{self.proxy_ping_port}")
|
||||||
|
|
||||||
@ -1598,18 +1598,17 @@ class MoRIIOConnectorWorker:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.info("Unexpected error when sending ping: %s", e)
|
logger.info("Unexpected error when sending ping: %s", e)
|
||||||
retry_count += 1
|
retry_count += 1
|
||||||
|
|
||||||
finally:
|
|
||||||
if retry_count >= MoRIIOConstants.MAX_PING_RETRIES:
|
if retry_count >= MoRIIOConstants.MAX_PING_RETRIES:
|
||||||
logger.error(
|
logger.error(
|
||||||
"Max retries (%s) exceeded. Stopping ping loop.",
|
"Max retries (%s) exceeded. Stopping ping loop.",
|
||||||
MoRIIOConstants.MAX_PING_RETRIES,
|
MoRIIOConstants.MAX_PING_RETRIES,
|
||||||
)
|
)
|
||||||
should_break = True
|
raise RuntimeError(f"Ping failed after {retry_count} retries")
|
||||||
|
|
||||||
|
finally:
|
||||||
time.sleep(MoRIIOConstants.PING_INTERVAL)
|
time.sleep(MoRIIOConstants.PING_INTERVAL)
|
||||||
index += 1
|
index += 1
|
||||||
if should_break:
|
|
||||||
break
|
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if hasattr(self, "_handshake_initiation_executor"):
|
if hasattr(self, "_handshake_initiation_executor"):
|
||||||
@ -1951,19 +1950,23 @@ class MoRIIOConnectorWorker:
|
|||||||
|
|
||||||
def _pop_done_transfers(self) -> set[str]:
|
def _pop_done_transfers(self) -> set[str]:
|
||||||
done_req_ids: set[str] = set()
|
done_req_ids: set[str] = set()
|
||||||
for req_id, status_list in self._recving_transfers.items():
|
with self.moriio_wrapper.lock:
|
||||||
if status_list[-1].Succeeded():
|
to_remove = []
|
||||||
done_req_ids.add(req_id)
|
for req_id, status_list in self._recving_transfers.items():
|
||||||
|
if status_list[-1].Succeeded():
|
||||||
|
done_req_ids.add(req_id)
|
||||||
|
|
||||||
self.moriio_wrapper.send_notify(
|
self.moriio_wrapper.send_notify(
|
||||||
req_id,
|
req_id,
|
||||||
self._recving_transfers_callback_addr[req_id][0],
|
self._recving_transfers_callback_addr[req_id][0],
|
||||||
self._recving_transfers_callback_addr[req_id][1],
|
self._recving_transfers_callback_addr[req_id][1],
|
||||||
)
|
)
|
||||||
|
to_remove.append(req_id)
|
||||||
|
for req_id in to_remove:
|
||||||
del self._recving_transfers[req_id]
|
del self._recving_transfers[req_id]
|
||||||
del self._recving_transfers_callback_addr[req_id]
|
del self._recving_transfers_callback_addr[req_id]
|
||||||
|
|
||||||
return done_req_ids
|
return done_req_ids
|
||||||
|
|
||||||
def save_kv_layer(
|
def save_kv_layer(
|
||||||
self,
|
self,
|
||||||
@ -2258,12 +2261,13 @@ class MoRIIOConnectorWorker:
|
|||||||
transfer_status = self.moriio_wrapper.read_remote_data(
|
transfer_status = self.moriio_wrapper.read_remote_data(
|
||||||
offs[2], offs[0], offs[1], sessions[sess_idx]
|
offs[2], offs[0], offs[1], sessions[sess_idx]
|
||||||
)
|
)
|
||||||
|
with self.moriio_wrapper.lock:
|
||||||
|
|
||||||
self._recving_transfers[request_id].append(transfer_status)
|
self._recving_transfers[request_id].append(transfer_status)
|
||||||
self._recving_transfers_callback_addr[request_id] = (
|
self._recving_transfers_callback_addr[request_id] = (
|
||||||
remote_host,
|
remote_host,
|
||||||
str(remote_notify_port + self.tp_rank),
|
str(remote_notify_port + self.tp_rank),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user