Offload port selection to OS (#467)

This commit is contained in:
Zhangir Azerbayev 2023-07-16 02:11:02 -04:00 committed by GitHub
parent 96853af5a8
commit 6d7d95a70a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,4 +1,4 @@
import random
import socket
from typing import List, Optional, Tuple
try:
@ -12,6 +12,12 @@ from vllm.config import ParallelConfig
DeviceID = Tuple[int, Optional[str], int]
def get_open_port() -> int:
    """Return a TCP port number that the OS reports as currently free.

    A socket is bound to port 0, which lets the operating system choose an
    unused port; that port number is read back and the socket is closed.
    IPv4 is tried first, and IPv6 is used as a fallback so the function also
    works on hosts where an IPv4 socket cannot be created or bound.

    Note: the probing socket is closed before returning, so the port is only
    known to have been free at the time of the call; another process may
    claim it before the caller binds to it.

    Returns:
        The port number selected by the OS.

    Raises:
        OSError: if neither an IPv4 nor an IPv6 socket could be bound.
    """
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            # Port 0 asks the OS to pick any available port.
            s.bind(("", 0))
            return s.getsockname()[1]
    except OSError:
        # IPv4 is unavailable on this host; retry with IPv6.
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
            s.bind(("", 0))
            # For AF_INET6 the address is a 4-tuple; the port is still [1].
            return s.getsockname()[1]
def initialize_cluster(
parallel_config: ParallelConfig,
engine_use_ray: bool = False,
@ -42,7 +48,7 @@ def initialize_cluster(
if not parallel_config.worker_use_ray:
# Initialize cluster locally.
port = random.randint(10000, 20000)
port = get_open_port()
# We need to setup the distributed init method to make sure
# the distributed megatron code (e.g., get world size) works correctly.
distributed_init_method = f"tcp://localhost:{port}"
@ -96,7 +102,7 @@ def initialize_cluster(
stage_devices.append((rank, node_resource, current_device_id))
if distributed_init_method is None:
ip = node_resource.split("node:")[-1]
port = random.randint(10000, 20000)
port = get_open_port()
distributed_init_method = f"tcp://{ip}:{port}"
rank += 1
current_device_id += 1