# vllm/benchmarks/disagg_benchmarks/zmq/test_connect_server2.py

import time
import zmq
import zmq.asyncio
import random
import asyncio


async def worker_routine(
    worker_url: str,
    context: zmq.asyncio.Context = None,
    i: int = 0,
) -> None:
    """Serve requests forever on a DEALER socket connected to ``worker_url``.

    Each request is expected to arrive as four frames
    ``[identity, url, headers, string]``.  For every request the worker
    sends back three canned chat-completion chunks, one multipart message
    ``[identity, reply]`` per chunk, sleeping a random 1-5 seconds before
    each chunk to simulate work.

    Args:
        worker_url: Endpoint the DEALER socket connects to.
        context: Context used to create the socket.  When ``None`` the
            process-wide ``zmq.asyncio.Context.instance()`` is used
            (note that ``inproc://`` endpoints are only reachable from the
            context that bound them).
        i: Worker index, used in the socket identity and in log output.

    The coroutine only ends by cancellation or by an error; in both cases
    the socket is closed so the owning context can terminate.
    """
    if context is None:
        # Previously a missing context crashed on ``None.socket``.
        context = zmq.asyncio.Context.instance()

    # The three response chunks never change, so build them once.
    stream_reply = (
        '{"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-4o-mini", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}',
        '{"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-4o-mini", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{"content":"Hello"},"logprobs":null,"finish_reason":null}]}',
        '{"id":"chatcmpl-123","object":"chat.completion.chunk","created":1694268190,"model":"gpt-4o-mini", "system_fingerprint": "fp_44709d6fcb", "choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]}',
    )

    # Socket to talk to dispatcher
    socket = context.socket(zmq.DEALER)
    try:
        socket.setsockopt(zmq.IDENTITY, f"worker-{i}-{time.time()}".encode())
        socket.connect(worker_url)
        print(f"worker-{i} {worker_url} started")
        while True:
            frames = await socket.recv_multipart()
            if len(frames) != 4:
                # A message with a different envelope would otherwise raise
                # ValueError on unpacking and take the whole worker down.
                print(f"worker-{i} Dropped malformed request "
                      f"with {len(frames)} frames")
                continue
            identity, url, headers, string = frames
            print(f"worker-{i} Received request identity: [{identity} ]")
            print(f"worker-{i} Received request url: [{url} ]")
            print(f"worker-{i} Received request headers: [{headers} ]")
            print(f"worker-{i} Received request string: [{string} ]")
            for j, chunk in enumerate(stream_reply):
                # Do some 'work' without blocking the event loop
                await asyncio.sleep(random.randint(1, 5))
                # Send reply back to client; the leading identity frame is
                # echoed back so the reply can be routed to the requester.
                reply = f"worker-{i} reply part-{j} {string} \n {chunk}"
                print(f"worker-{i} Sent reply: [{identity} {reply} ]")
                await socket.send_multipart([identity, reply.encode()])
    finally:
        # Runs on cancellation or error.  An open socket would make a later
        # context termination block, so drop anything still queued.
        socket.close(linger=0)

async def _forward(source, sink):
    """Relay every multipart message from ``source`` to ``sink`` until cancelled."""
    while True:
        await sink.send_multipart(await source.recv_multipart())


async def main():
    """Run the test server: a ROUTER front end feeding five workers.

    Clients connect to the ROUTER socket on TCP port 5556.  Requests are
    relayed to a DEALER socket bound on ``inproc://workers``, to which the
    worker coroutines connect; replies are relayed back the other way.

    The relay is done by two forwarding coroutines on the event loop rather
    than by a blocking ``zmq.proxy`` call in a helper thread: a blocking
    proxy cannot be cancelled, and it would be using sockets that the
    cleanup code closes from the event-loop thread.

    On interruption or on a ``zmq.ZMQError`` all tasks are cancelled and the
    context is destroyed, closing every socket created from it.
    """
    url_worker = "inproc://workers"
    # A numeric address is used because libzmq's tcp bind endpoint is an
    # interface/address, not a host name to be resolved.
    url_client = "tcp://127.0.0.1:5556"

    # Prepare our context and sockets
    context = zmq.asyncio.Context()
    tasks = []

    try:
        # Socket to talk to clients
        clients = context.socket(zmq.ROUTER)
        clients.bind(url_client)
        print("Server ROUTER started at", url_client)
        # Socket to talk to workers
        workers = context.socket(zmq.DEALER)
        workers.bind(url_worker)
        print("Worker DEALER started at", url_worker)

        tasks = [
            asyncio.create_task(worker_routine(url_worker, context, i))
            for i in range(5)
        ]
        tasks.append(asyncio.create_task(_forward(clients, workers)))
        tasks.append(asyncio.create_task(_forward(workers, clients)))

        await asyncio.gather(*tasks)
    except asyncio.CancelledError:
        # Under asyncio.run() a Ctrl-C reaches this coroutine as a
        # cancellation; report it and let the cancellation propagate.
        print("Server interrupted")
        raise
    except KeyboardInterrupt:
        print("Server interrupted")
    except zmq.ZMQError as e:
        print("ZMQError:", e)
    finally:
        # Stop whatever is still running before tearing the sockets down.
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        # Closes every socket created from this context (including any a
        # worker left open) and then terminates it; linger=0 discards
        # unsent messages so termination cannot block.
        context.destroy(linger=0)


if __name__ == "__main__":
    # Script entry point: run the server coroutine on a fresh event loop.
    server = main()
    asyncio.run(server)