Offload port selection to OS (#467)

2025-12-16 10:16:23 +08:00 · 2023-07-16 02:11:02 -04:00 · 2023-07-16 02:11:02 -04:00 · 6d7d95a70a
commit 6d7d95a70a
parent 96853af5a8
1 changed files with 9 additions and 3 deletions
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@ -1,4 +1,4 @@
-import random
+import socket
 from typing import List, Optional, Tuple

 try:
@ -12,6 +12,12 @@ from vllm.config import ParallelConfig
 DeviceID = Tuple[int, Optional[str], int]


+def get_open_port() -> int:  # Return a TCP port number chosen by the OS.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:  # IPv4 TCP socket
+        s.bind(("", 0))  # Port 0 asks the OS to assign a currently unused port.
+        return s.getsockname()[1]  # NOTE: `with` closes the socket, so the port is not reserved.
+
+
 def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
@ -42,7 +48,7 @@ def initialize_cluster(

    if not parallel_config.worker_use_ray:
        # Initialize cluster locally.
-        port = random.randint(10000, 20000)
+        port = get_open_port()
        # We need to setup the distributed init method to make sure
        # the distributed megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
@ -96,7 +102,7 @@ def initialize_cluster(
            stage_devices.append((rank, node_resource, current_device_id))
            if distributed_init_method is None:
                ip = node_resource.split("node:")[-1]
-                port = random.randint(10000, 20000)
+                port = get_open_port()
                distributed_init_method = f"tcp://{ip}:{port}"
            rank += 1
            current_device_id += 1