From 298298f97d82c7d84d2fba8193ac54ca3e23ac9d Mon Sep 17 00:00:00 2001
From: clark <panf2333@gmail.com>
Date: Mon, 20 Jan 2025 13:42:47 +0800
Subject: [PATCH] remove invalid zmq benchmark code

Signed-off-by: clark <panf2333@gmail.com>
---
 .../disagg_performance_benchmark_zmq_http.sh  | 207 ------------------
 .../disagg_benchmarks/zmq/test_request.py     | 109 ---------
 .../visualize_benchmark_results_zmq_http.py   |  72 ------
 3 files changed, 388 deletions(-)
 delete mode 100644 benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh
 delete mode 100644 benchmarks/disagg_benchmarks/zmq/test_request.py
 delete mode 100644 benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py

diff --git a/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh b/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh
deleted file mode 100644
index 11a2f6c2622f4..0000000000000
--- a/benchmarks/disagg_benchmarks/zmq/disagg_performance_benchmark_zmq_http.sh
+++ /dev/null
@@ -1,207 +0,0 @@
-#!/bin/bash
-
-# Requirement: 2x GPUs.
-
-
-# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
-# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
-# Resource: 2x GPU
-# Approaches:
-# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
-# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
-# Prefilling instance: max_output_token=1
-# Decoding instance: force the input tokens be the same across requests to bypass prefilling
-
-set -ex
-
-kill_gpu_processes() {
-  # kill all processes on GPU.
-  pgrep pt_main_thread | xargs -r kill -9
-  pgrep python3 | xargs -r kill -9
-  for port in 7010 7011 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done
-  sleep 1
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  local port=$1
-  timeout 1200 bash -c "
-    until curl -s localhost:${port}/v1/completions > /dev/null; do
-      sleep 1
-    done" && return 0 || return 1
-}
-
-launch_chunked_prefill() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  gpu_memory_utilization=0.6
-  max_model_len=10000
-  VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8100 \
-    --max-model-len $max_model_len \
-    --enable-chunked-prefill \
-    --gpu-memory-utilization $gpu_memory_utilization &
-  VLLM_LOGGING_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8200 \
-    --max-model-len $max_model_len \
-    --enable-chunked-prefill \
-    --gpu-memory-utilization $gpu_memory_utilization &
-  wait_for_server 8100
-  wait_for_server 8200
-  python3 ../round_robin_proxy.py &
-  sleep 1
-}
-
-launch_disagg_prefill_http() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  # disagg prefill
-  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8100 \
-    --max-model-len 10000 \
-    --gpu-memory-utilization 0.6 \
-    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8200 \
-    --max-model-len 10000 \
-    --gpu-memory-utilization 0.6 \
-    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  wait_for_server 8100
-  wait_for_server 8200
-  python3 ../disagg_prefill_proxy_server.py &
-  sleep 1
-}
-
-
-
-launch_disagg_prefill_zmq() {
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  gpu_memory_utilization=0.6
-  max_model_len=10000
-  # disagg prefill
-  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8100 \
-    --zmq-server-port 7010 \
-    --max-model-len $max_model_len \
-    --gpu-memory-utilization $gpu_memory_utilization \
-    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  VLLM_LOGGING_LEVEL=DEBUG CUDA_LAUNCH_BLOCKING=1 CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
-    --port 8200 \
-    --zmq-server-port 7011 \
-    --max-model-len $max_model_len \
-    --gpu-memory-utilization $gpu_memory_utilization \
-    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
-
-  python3 \
-    -m vllm.entrypoints.disagg_connector \
-    --port 8000 \
-    --prefill-addr 127.0.0.1:7010 \
-    --decode-addr 127.0.0.1:7011 &
-
-  wait_for_server 8100
-  wait_for_server 8200
-  wait_for_server 8000
-  sleep 1
-}
-
-
-benchmark() {
-  results_folder="./results"
-  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
-  dataset_name="sonnet"
-  dataset_path="../../sonnet_4x.txt"
-  num_prompts=100
-  qps=$1
-  prefix_len=50
-  input_len=1024
-  output_len=$2
-  tag=$3
-
-  python3 ../../benchmark_serving.py \
-          --backend vllm \
-          --model $model \
-          --dataset-name $dataset_name \
-          --dataset-path $dataset_path \
-          --sonnet-input-len $input_len \
-          --sonnet-output-len "$output_len" \
-          --sonnet-prefix-len $prefix_len \
-          --num-prompts $num_prompts \
-          --port 8000 \
-          --save-result \
-          --result-dir $results_folder \
-          --result-filename "$tag"-qps-"$qps".json \
-          --request-rate "$qps"
-
-  sleep 2
-}
-
-
-main() {
-
-  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-  (which jq) || (apt-get -y install jq)
-  (which socat) || (apt-get -y install socat)
-  (which lsof) || (apt-get -y install lsof)
-  pip install quart httpx matplotlib aiohttp datasets
-  cd "$(dirname "$0")"
-  cd ../..
-  # create sonnet-4x.txt so that we can sample 2048 tokens for input
-  echo "" > sonnet_4x.txt
-  for _ in {1..4}
-  do
-    cat sonnet.txt >> sonnet_4x.txt
-  done
-  cd disagg_benchmarks/zmq
-
-  rm -rf results
-  mkdir results
-  mkdir results/http_zmq_chunk
-  mkdir results/http_zmq
-
-  default_output_len=6
-
-  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-
-  launch_chunked_prefill
-  for qps in 2 4 6 8 10 12; do
-  benchmark $qps $default_output_len chunked_prefill
-  done
-  kill_gpu_processes
-
-
-  launch_disagg_prefill_http
-  for qps in 2 4 6 8 10 12; do
-  benchmark $qps $default_output_len disagg_prefill_http
-  done
-  kill_gpu_processes
-
-  launch_disagg_prefill_zmq
-  for qps in 2 4 6 8 10 12; do
-  benchmark $qps $default_output_len disagg_prefill_zmq
-  done
-  kill_gpu_processes
-
-  python3 visualize_benchmark_results_zmq_http.py
-
-}
-
-
-main "$@"
diff --git a/benchmarks/disagg_benchmarks/zmq/test_request.py b/benchmarks/disagg_benchmarks/zmq/test_request.py
deleted file mode 100644
index 9a3e82f1ccf45..0000000000000
--- a/benchmarks/disagg_benchmarks/zmq/test_request.py
+++ /dev/null
@@ -1,109 +0,0 @@
-import asyncio
-import json
-
-import aiohttp
-
-
-# test connect completions we assume prefill and decode are on the same node
-# 1. node:vllm serve facebook/opt-125m --port 7001 --zmq-server-port 7010 \
-#   --chat-template ~/vllm/examples/template_chatglm2.jinja
-# 2. vllm connect --prefill-addr 127.0.0.1:7010 --decode-addr 127.0.0.1:7010
-# 3. python test_request.py
-async def test_connect_completions(session):
-    try:
-        base_url = "http://localhost:8001/v1/completions"
-        body = {
-            "temperature": 0.5,
-            "top_p": 0.9,
-            "max_tokens": 150,
-            "frequency_penalty": 1.3,
-            "presence_penalty": 0.2,
-            "repetition_penalty": 1.2,
-            "model": "facebook/opt-125m",
-            "prompt": "Can you introduce vllm?",
-            # "stream": False,
-            "stream": True,
-            "stream_options": {
-                "include_usage": True
-            }
-        }
-        print(f"Sending request to {base_url}, body {body}")
-        async with session.post(base_url, json=body) as response:
-
-            print(response.status)
-            print(response.headers)
-            responseText = ""
-            if response.status == 200:
-                transfer_encoding = response.headers.get('Transfer-Encoding')
-                content_type = response.headers.get('Content-Type')
-                print(f"Transfer-Encoding: {transfer_encoding}")
-                if transfer_encoding == 'chunked':
-                    async for chunk in response.content.iter_chunked(1024):
-                        try:
-                            decoded_chunk = chunk.decode('utf-8')
-                            # print(f"Decoded chunk: {decoded_chunk!r}")
-                            responseText += decoded_chunk
-                        except UnicodeDecodeError:
-                            print(f"Error decoding chunk: {chunk!r}")
-                elif 'application/json' in content_type:
-                    responseText = await response.json()
-                    print(f"response {responseText!r}")
-                else:
-                    # Print the headers and JSON response
-                    print("Unexpected Transfer-Encoding: {} {} {}".format(
-                        transfer_encoding, response.headers, await
-                        response.json()))
-            else:
-                print(f"Request failed with status code {response.status}")
-                print(f"Response : {await response.json()}")
-            print(f"baseurl {base_url}")
-            print(f"response data {extract_data(responseText)}")
-    except aiohttp.ClientError as e:
-        print(f"Error: {e}")
-
-
-def is_json(data):
-    try:
-        json.loads(data)
-        return True
-    except ValueError:
-        return False
-
-
-def extract_data(responseText):
-    reply = ""
-    if responseText == "":
-        return reply
-    if is_json(responseText):
-        return responseText
-
-    for data in responseText.split("\n\n"):
-        if data.startswith('data: '):
-            content = data[6:]
-            if content == "[DONE]":
-                print("DONE")
-                break
-            try:
-                json_data = json.loads(content)
-                choices = json_data["choices"]
-                if len(choices) > 0:
-                    content = choices[0]["text"]
-                    reply += content
-            except json.JSONDecodeError:
-                print(f"Error: Invalid data format: {data}")
-                return reply
-        else:
-            print(f"Error: Invalid data format: {data}")
-
-    return reply
-
-
-async def main():
-    async with aiohttp.ClientSession() as session:
-        tasks = []
-        for _ in range(2):
-            tasks.append(test_connect_completions(session))
-        await asyncio.gather(*tasks)
-
-
-asyncio.run(main())
diff --git a/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py b/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py
deleted file mode 100644
index 155a5d5f88986..0000000000000
--- a/benchmarks/disagg_benchmarks/zmq/visualize_benchmark_results_zmq_http.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import json
-
-import matplotlib.pyplot as plt
-import pandas as pd
-
-if __name__ == "__main__":
-    data = []
-    for name in [
-            'disagg_prefill_http', 'disagg_prefill_zmq', 'chunked_prefill'
-    ]:
-        for qps in [2, 4, 6, 8, 10, 12]:
-            with open(f"results/{name}-qps-{qps}.json") as f:
-                x = json.load(f)
-                x['name'] = name
-                x['qps'] = qps
-                data.append(x)
-
-    df = pd.DataFrame.from_dict(data)
-    dis_http_df = df[df['name'] == 'disagg_prefill_http']
-    dis_zmq_df = df[df['name'] == 'disagg_prefill_zmq']
-    chu_df = df[df['name'] == 'chunked_prefill']
-
-    plt.style.use('bmh')
-    plt.rcParams['font.size'] = 20
-
-    for key in [
-            'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms',
-            'median_itl_ms', 'p99_itl_ms'
-    ]:
-
-        fig, ax = plt.subplots(figsize=(11, 7))
-        plt.plot(dis_http_df['qps'],
-                 dis_http_df[key],
-                 label='disagg_prefill_http',
-                 marker='o',
-                 linewidth=4)
-        plt.plot(dis_zmq_df['qps'],
-                 dis_zmq_df[key],
-                 label='disagg_prefill_zmq',
-                 marker='o',
-                 linewidth=4)
-        plt.plot(chu_df['qps'],
-                 chu_df[key],
-                 label='chunked_prefill',
-                 marker='o',
-                 linewidth=4)
-        ax.legend()
-
-        ax.set_xlabel('QPS')
-        ax.set_ylabel(key)
-        ax.set_ylim(bottom=0)
-        fig.savefig(f'results/http_zmq_chunk/{key}.png')
-        plt.close(fig)
-
-        fig1, ax1 = plt.subplots(figsize=(11, 7))
-        plt.plot(dis_http_df['qps'],
-                 dis_http_df[key],
-                 label='disagg_prefill_http',
-                 marker='o',
-                 linewidth=4)
-        plt.plot(dis_zmq_df['qps'],
-                 dis_zmq_df[key],
-                 label='disagg_prefill_zmq',
-                 marker='o',
-                 linewidth=4)
-        ax1.legend()
-
-        ax1.set_xlabel('QPS')
-        ax1.set_ylabel(key)
-        ax1.set_ylim(bottom=0)
-        fig1.savefig(f'results/http_zmq/{key}.png')
-        plt.close(fig1)