mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 01:45:01 +08:00
[BugFix] Harden distributed DP startup (#21538)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
parent
d84b97a3e3
commit
881e1af43a
@ -2794,6 +2794,9 @@ def make_zmq_socket(
|
|||||||
if linger is not None:
|
if linger is not None:
|
||||||
socket.setsockopt(zmq.LINGER, linger)
|
socket.setsockopt(zmq.LINGER, linger)
|
||||||
|
|
||||||
|
if socket_type == zmq.XPUB:
|
||||||
|
socket.setsockopt(zmq.XPUB_VERBOSE, True)
|
||||||
|
|
||||||
# Determine if the path is a TCP socket with an IPv6 address.
|
# Determine if the path is a TCP socket with an IPv6 address.
|
||||||
# Enable IPv6 on the zmq socket if so.
|
# Enable IPv6 on the zmq socket if so.
|
||||||
scheme, host, _ = split_zmq_path(path)
|
scheme, host, _ = split_zmq_path(path)
|
||||||
|
|||||||
@ -172,6 +172,18 @@ class DPCoordinatorProc:
|
|||||||
bind=True,
|
bind=True,
|
||||||
) as publish_back:
|
) as publish_back:
|
||||||
|
|
||||||
|
# Wait until all engines subscribe.
|
||||||
|
for _ in self.engines:
|
||||||
|
if publish_back.recv() != b'\x01':
|
||||||
|
logger.error(
|
||||||
|
"DP Coordinator received unexpected message while "
|
||||||
|
"waiting for engines to subscribe")
|
||||||
|
return
|
||||||
|
# Send ready message to engines.
|
||||||
|
publish_back.send(b"READY")
|
||||||
|
|
||||||
|
logger.info("All engine subscriptions received by DP coordinator")
|
||||||
|
|
||||||
poller = zmq.Poller()
|
poller = zmq.Poller()
|
||||||
poller.register(publish_front, zmq.POLLIN)
|
poller.register(publish_front, zmq.POLLIN)
|
||||||
poller.register(output_back, zmq.POLLIN)
|
poller.register(output_back, zmq.POLLIN)
|
||||||
|
|||||||
@ -461,8 +461,11 @@ class EngineCoreProc(EngineCore):
|
|||||||
self.has_coordinator = addresses.coordinator_output is not None
|
self.has_coordinator = addresses.coordinator_output is not None
|
||||||
self.frontend_stats_publish_address = (
|
self.frontend_stats_publish_address = (
|
||||||
addresses.frontend_stats_publish_address)
|
addresses.frontend_stats_publish_address)
|
||||||
|
logger.debug("Has DP Coordinator: %s, stats publish address: %s",
|
||||||
|
self.has_coordinator,
|
||||||
|
self.frontend_stats_publish_address)
|
||||||
# Only publish request queue stats to coordinator for "internal"
|
# Only publish request queue stats to coordinator for "internal"
|
||||||
# LB mode.
|
# and "hybrid" LB modes.
|
||||||
self.publish_dp_lb_stats = (
|
self.publish_dp_lb_stats = (
|
||||||
self.has_coordinator
|
self.has_coordinator
|
||||||
and not vllm_config.parallel_config.data_parallel_external_lb)
|
and not vllm_config.parallel_config.data_parallel_external_lb)
|
||||||
@ -472,25 +475,38 @@ class EngineCoreProc(EngineCore):
|
|||||||
super().__init__(vllm_config, executor_class, log_stats,
|
super().__init__(vllm_config, executor_class, log_stats,
|
||||||
executor_fail_callback)
|
executor_fail_callback)
|
||||||
|
|
||||||
|
# Background Threads and Queues for IO. These enable us to
|
||||||
|
# overlap ZMQ socket IO with GPU since they release the GIL,
|
||||||
|
# and to overlap some serialization/deserialization with the
|
||||||
|
# model forward pass.
|
||||||
|
# Threads handle Socket <-> Queues and core_busy_loop uses Queue.
|
||||||
|
ready_event = threading.Event()
|
||||||
|
input_thread = threading.Thread(target=self.process_input_sockets,
|
||||||
|
args=(addresses.inputs,
|
||||||
|
addresses.coordinator_input,
|
||||||
|
identity, ready_event),
|
||||||
|
daemon=True)
|
||||||
|
input_thread.start()
|
||||||
|
|
||||||
|
self.output_thread = threading.Thread(
|
||||||
|
target=self.process_output_sockets,
|
||||||
|
args=(addresses.outputs, addresses.coordinator_output,
|
||||||
|
self.engine_index),
|
||||||
|
daemon=True)
|
||||||
|
self.output_thread.start()
|
||||||
|
|
||||||
|
# Don't complete handshake until DP coordinator ready message is
|
||||||
|
# received.
|
||||||
|
while not ready_event.wait(timeout=10):
|
||||||
|
if not input_thread.is_alive():
|
||||||
|
raise RuntimeError(
|
||||||
|
"Input socket thread died during startup")
|
||||||
|
assert addresses.coordinator_input is not None
|
||||||
|
logger.info("Waiting for READY message from DP Coordinator...")
|
||||||
|
|
||||||
self.step_fn = (self.step if self.batch_queue is None else
|
self.step_fn = (self.step if self.batch_queue is None else
|
||||||
self.step_with_batch_queue)
|
self.step_with_batch_queue)
|
||||||
|
|
||||||
# Background Threads and Queues for IO. These enable us to
|
|
||||||
# overlap ZMQ socket IO with GPU since they release the GIL,
|
|
||||||
# and to overlap some serialization/deserialization with the
|
|
||||||
# model forward pass.
|
|
||||||
# Threads handle Socket <-> Queues and core_busy_loop uses Queue.
|
|
||||||
threading.Thread(target=self.process_input_sockets,
|
|
||||||
args=(addresses.inputs, addresses.coordinator_input,
|
|
||||||
identity),
|
|
||||||
daemon=True).start()
|
|
||||||
self.output_thread = threading.Thread(
|
|
||||||
target=self.process_output_sockets,
|
|
||||||
args=(addresses.outputs, addresses.coordinator_output,
|
|
||||||
self.engine_index),
|
|
||||||
daemon=True)
|
|
||||||
self.output_thread.start()
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def _perform_handshakes(
|
def _perform_handshakes(
|
||||||
self,
|
self,
|
||||||
@ -505,10 +521,10 @@ class EngineCoreProc(EngineCore):
|
|||||||
|
|
||||||
For DP=1 or offline mode, this is with the colocated front-end process.
|
For DP=1 or offline mode, this is with the colocated front-end process.
|
||||||
|
|
||||||
For DP>1 with internal loadbalancing this is with the shared front-end
|
For DP>1 with internal load-balancing this is with the shared front-end
|
||||||
process which may reside on a different node.
|
process which may reside on a different node.
|
||||||
|
|
||||||
For DP>1 with external or hybrid loadbalancing, two handshakes are
|
For DP>1 with external or hybrid load-balancing, two handshakes are
|
||||||
performed:
|
performed:
|
||||||
- With the rank 0 front-end process which retrieves the
|
- With the rank 0 front-end process which retrieves the
|
||||||
DP Coordinator ZMQ addresses and DP process group address.
|
DP Coordinator ZMQ addresses and DP process group address.
|
||||||
@ -772,7 +788,7 @@ class EngineCoreProc(EngineCore):
|
|||||||
|
|
||||||
def process_input_sockets(self, input_addresses: list[str],
|
def process_input_sockets(self, input_addresses: list[str],
|
||||||
coord_input_address: Optional[str],
|
coord_input_address: Optional[str],
|
||||||
identity: bytes):
|
identity: bytes, ready_event: threading.Event):
|
||||||
"""Input socket IO thread."""
|
"""Input socket IO thread."""
|
||||||
|
|
||||||
# Msgpack serialization decoding.
|
# Msgpack serialization decoding.
|
||||||
@ -809,9 +825,14 @@ class EngineCoreProc(EngineCore):
|
|||||||
# back to us.
|
# back to us.
|
||||||
input_socket.send(b'')
|
input_socket.send(b'')
|
||||||
poller.register(input_socket, zmq.POLLIN)
|
poller.register(input_socket, zmq.POLLIN)
|
||||||
|
|
||||||
if coord_socket is not None:
|
if coord_socket is not None:
|
||||||
|
# Wait for ready message from coordinator.
|
||||||
|
assert coord_socket.recv() == b"READY"
|
||||||
poller.register(coord_socket, zmq.POLLIN)
|
poller.register(coord_socket, zmq.POLLIN)
|
||||||
|
|
||||||
|
ready_event.set()
|
||||||
|
del ready_event
|
||||||
while True:
|
while True:
|
||||||
for input_socket, _ in poller.poll():
|
for input_socket, _ in poller.poll():
|
||||||
# (RequestType, RequestData)
|
# (RequestType, RequestData)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user