From 6d7d95a70a4308699ad60fb5bcc08904135086f2 Mon Sep 17 00:00:00 2001
From: Zhangir Azerbayev <59542043+zhangir-azerbayev@users.noreply.github.com>
Date: Sun, 16 Jul 2023 02:11:02 -0400
Subject: [PATCH] Offload port selection to OS (#467)

---
 vllm/engine/ray_utils.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index 66ca2b725efa7..01faf0d59035a 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -1,4 +1,4 @@
-import random
+import socket
 from typing import List, Optional, Tuple
 
 try:
@@ -12,6 +12,12 @@ from vllm.config import ParallelConfig
 DeviceID = Tuple[int, Optional[str], int]
 
 
+def get_open_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
+
 def initialize_cluster(
     parallel_config: ParallelConfig,
     engine_use_ray: bool = False,
@@ -42,7 +48,7 @@ def initialize_cluster(
 
     if not parallel_config.worker_use_ray:
         # Initialize cluster locally.
-        port = random.randint(10000, 20000)
+        port = get_open_port()
         # We need to setup the distributed init method to make sure
         # the distributed megatron code (e.g., get world size) works correctly.
         distributed_init_method = f"tcp://localhost:{port}"
@@ -96,7 +102,7 @@ def initialize_cluster(
             stage_devices.append((rank, node_resource, current_device_id))
             if distributed_init_method is None:
                 ip = node_resource.split("node:")[-1]
-                port = random.randint(10000, 20000)
+                port = get_open_port()
                 distributed_init_method = f"tcp://{ip}:{port}"
             rank += 1
             current_device_id += 1
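
For context: binding a TCP socket to port 0 asks the operating system for any currently free ephemeral port, which is why the get_open_port() added above avoids the collisions that random.randint(10000, 20000) can hit when the drawn port is already in use. Below is a minimal standalone sketch of the same technique outside the vllm codebase; the build_init_method helper is hypothetical and only illustrates how such a port would feed a tcp:// init address like the one the patch constructs.

import socket


def get_open_port() -> int:
    # Bind to port 0 so the kernel assigns an unused ephemeral port,
    # then report which port was actually chosen.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def build_init_method(host: str = "localhost") -> str:
    # Hypothetical helper: format the port into a tcp:// init address,
    # mirroring how the patch builds distributed_init_method.
    return f"tcp://{host}:{get_open_port()}"


if __name__ == "__main__":
    # The probe socket is closed before the port is used, so another process
    # could still grab it first; the patch accepts that small race in exchange
    # for far fewer collisions than picking a random number in 10000-20000.
    print(build_init_method())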