From 87bf6812b24f92aa34dfb3326c4b6db0ef442927 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 19 Jun 2025 13:15:50 +0000 Subject: [PATCH] updated Signed-off-by: rshaw@neuralmagic.com --- pd_justfile/Justfile | 110 ++++++++++++++++++++++++++++++++++ pd_justfile/port_allocator.py | 99 ++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 pd_justfile/Justfile create mode 100755 pd_justfile/port_allocator.py diff --git a/pd_justfile/Justfile b/pd_justfile/Justfile new file mode 100644 index 0000000000000..92f7c6ab2d4dd --- /dev/null +++ b/pd_justfile/Justfile @@ -0,0 +1,110 @@ +# Setting this allows creating a symlink to Justfile from another dir +set working-directory := "/home/rshaw/vllm/pd_examples/" + +# Needed for the proxy server +vllm-directory := "/home/rshaw/vllm/" + +# MODEL := "Qwen/Qwen3-0.6B" +MODEL := "meta-llama/Llama-3.1-8B-Instruct" + +port PORT: + @python port_allocator.py {{PORT}} + + +prefill: + VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5557) \ + CUDA_VISIBLE_DEVICES=0,1 \ + vllm serve {{MODEL}} \ + --port $(just port 8100) \ + --tensor-parallel-size 2 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + +prefill_b: + VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5558) \ + CUDA_VISIBLE_DEVICES=6 \ + vllm serve {{MODEL}} \ + --port $(just port 8200) \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + +decode: + VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \ + CUDA_VISIBLE_DEVICES=2,3,4,5 \ + vllm serve {{MODEL}} \ + --port $(just port 8300) \ + --tensor-parallel-size 2 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + +# proxy: +# python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \ +# 
--port $(just port 8192) \ +# --prefiller-port $(just port 8100) $(just port 8200) \ +# --prefiller-host localhost localhost \ +# --decoder-port $(just port 8300) + +proxy: + python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \ + --port $(just port 8192) \ + --prefiller-port $(just port 8100) \ + --prefiller-host localhost \ + --decoder-port $(just port 8300) + +send_request: + curl -X POST http://localhost:$(just port 8192)/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ \ + "model": "{{MODEL}}", \ + "prompt": "XXRed Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \ + "max_tokens": 150, \ + "temperature": 0.7 \ + }' + +benchmark NUM_PROMPTS: + python {{vllm-directory}}/benchmarks/benchmark_serving.py \ + --port $(just port 8192) \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len 10000 \ + --random-output-len 100 \ + --num-prompts {{NUM_PROMPTS}} \ + --seed $(date +%s) \ + +benchmark_one INPUT_LEN: + python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \ + --model {{MODEL}} \ + --input-len {{INPUT_LEN}} \ + --output-len 1 \ + --num-requests 10 \ + --seed $(date +%s) \ + --port $(just port 8192) + +benchmark_one_no_pd INPUT_LEN: + python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \ + --model {{MODEL}} \ + --input-len {{INPUT_LEN}} \ + --output-len 1 \ + --num-requests 10 \ + --seed $(date +%s) \ + --port $(just port 8100) + +reset_prefix_cache: + curl -X POST http://localhost:$(just port 8100)/reset_prefix_cache && \ + curl -X POST http://localhost:$(just port 8200)/reset_prefix_cache + +eval: + lm_eval --model local-completions --tasks gsm8k \ + --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port 8192)/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \ + --limit 1000 + +eval_port PORT: + lm_eval --model 
#!/usr/bin/env python3
"""
Port Allocation Utility

A small utility that generates consistent port numbers based on username and
default port to avoid port collisions during development.
"""

import argparse
import getpass
import hashlib


def allocate_port(base_port,
                  username=None,
                  project_name=None,
                  port_range=None):
    """
    Allocate a deterministic port based on username and base port.

    The same (project_name, username, base_port) triple always maps to the
    same port, so every developer gets a stable personal port that is very
    unlikely to collide with other users' ports or with the service default.

    Args:
        base_port (int): The default port number for the service.
        username (str, optional): Username to use for hashing. Defaults to
            the current user.
        project_name (str, optional): Project name to make ports unique
            per project.
        port_range (tuple, optional): Inclusive (min, max) range of valid
            ports. Defaults to (1024, 65535).

    Returns:
        int: A port number within port_range derived from hashing the
        username and base port.
    """
    if not username:
        username = getpass.getuser()

    if not port_range:
        port_range = (1024, 65535)

    min_port, max_port = port_range
    # Bug fix (off-by-one): the range is documented as (min, max) of *valid*
    # ports, i.e. inclusive, so there are max - min + 1 candidates; the
    # original `max_port - min_port` made max_port itself unreachable.
    available_range = max_port - min_port + 1

    # Hash input is "project:username:base_port" (project part optional).
    hash_input = f"{username}:{base_port}"
    if project_name:
        hash_input = f"{project_name}:{hash_input}"

    # md5 is used only for a stable, well-spread mapping — not for security.
    hash_obj = hashlib.md5(hash_input.encode())
    hash_int = int(hash_obj.hexdigest(), 16)

    # Map the hash onto the valid port range.
    port_offset = hash_int % available_range
    allocated_port = min_port + port_offset

    # Best-effort: steer clear of the service's default port (within 10).
    if abs(allocated_port - base_port) < 10:
        # Bug fix: shift the *offset* and re-anchor at min_port exactly once.
        # The original `(allocated_port + 100) % available_range + min_port`
        # wrapped a value that already included min_port, effectively adding
        # min_port twice and scrambling the intended "+100" shift.
        # NOTE(review): for tiny ranges the shifted port can still land near
        # base_port — acceptable for a dev convenience tool.
        port_offset = (port_offset + 100) % available_range
        allocated_port = min_port + port_offset

    return allocated_port


def main():
    """Parse CLI arguments, allocate a port, and print it.

    Prints either the bare port number or, with --env-var NAME, a shell
    `export NAME=port` line suitable for use with `eval`.
    """
    parser = argparse.ArgumentParser(
        description='Allocate a consistent port based on username and base port'
    )
    parser.add_argument('base_port',
                        type=int,
                        help='The default port number for the service')
    parser.add_argument('--username',
                        '-u',
                        help='Username to use (defaults to current user)')
    parser.add_argument('--project',
                        '-p',
                        help='Project name to make ports unique per project')
    parser.add_argument('--env-var',
                        '-e',
                        help='Output as export ENV_VAR=port')
    parser.add_argument('--min-port',
                        type=int,
                        default=1024,
                        help='Minimum port number')
    parser.add_argument('--max-port',
                        type=int,
                        default=65535,
                        help='Maximum port number')

    args = parser.parse_args()

    port = allocate_port(args.base_port,
                         username=args.username,
                         project_name=args.project,
                         port_range=(args.min_port, args.max_port))

    if args.env_var:
        print(f"export {args.env_var}={port}")
    else:
        print(port)


if __name__ == "__main__":
    main()