From 87bf6812b24f92aa34dfb3326c4b6db0ef442927 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 19 Jun 2025 13:15:50 +0000 Subject: [PATCH] updated Signed-off-by: rshaw@neuralmagic.com --- pd_justfile/Justfile | 110 ++++++++++++++++++++++++++++++++++ pd_justfile/port_allocator.py | 99 ++++++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 pd_justfile/Justfile create mode 100755 pd_justfile/port_allocator.py diff --git a/pd_justfile/Justfile b/pd_justfile/Justfile new file mode 100644 index 0000000000000..92f7c6ab2d4dd --- /dev/null +++ b/pd_justfile/Justfile @@ -0,0 +1,110 @@ +# Setting this allows creating a symlink to Justfile from another dir +set working-directory := "/home/rshaw/vllm/pd_examples/" + +# Needed for the proxy server +vllm-directory := "/home/rshaw/vllm/" + +# MODEL := "Qwen/Qwen3-0.6B" +MODEL := "meta-llama/Llama-3.1-8B-Instruct" + +port PORT: + @python port_allocator.py {{PORT}} + + +prefill: + VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5557) \ + CUDA_VISIBLE_DEVICES=0,1 \ + vllm serve {{MODEL}} \ + --port $(just port 8100) \ + --tensor-parallel-size 2 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + +prefill_b: + VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5558) \ + CUDA_VISIBLE_DEVICES=6 \ + vllm serve {{MODEL}} \ + --port $(just port 8200) \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + +decode: + VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \ + CUDA_VISIBLE_DEVICES=2,3,4,5 \ + vllm serve {{MODEL}} \ + --port $(just port 8300) \ + --tensor-parallel-size 2 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + +# proxy: +# python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \ +# 
--port $(just port 8192) \ +# --prefiller-port $(just port 8100) $(just port 8200) \ +# --prefiller-host localhost localhost \ +# --decoder-port $(just port 8300) + +proxy: + python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \ + --port $(just port 8192) \ + --prefiller-port $(just port 8100) \ + --prefiller-host localhost \ + --decoder-port $(just port 8300) + +send_request: + curl -X POST http://localhost:$(just port 8192)/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ \ + "model": "{{MODEL}}", \ + "prompt": "XXRed Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \ + "max_tokens": 150, \ + "temperature": 0.7 \ + }' + +benchmark NUM_PROMPTS: + python {{vllm-directory}}/benchmarks/benchmark_serving.py \ + --port $(just port 8192) \ + --model {{MODEL}} \ + --dataset-name random \ + --random-input-len 10000 \ + --random-output-len 100 \ + --num-prompts {{NUM_PROMPTS}} \ + --seed $(date +%s) \ + +benchmark_one INPUT_LEN: + python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \ + --model {{MODEL}} \ + --input-len {{INPUT_LEN}} \ + --output-len 1 \ + --num-requests 10 \ + --seed $(date +%s) \ + --port $(just port 8192) + +benchmark_one_no_pd INPUT_LEN: + python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \ + --model {{MODEL}} \ + --input-len {{INPUT_LEN}} \ + --output-len 1 \ + --num-requests 10 \ + --seed $(date +%s) \ + --port $(just port 8100) + +reset_prefix_cache: + curl -X POST http://localhost:$(just port 8100)/reset_prefix_cache && \ + curl -X POST http://localhost:$(just port 8200)/reset_prefix_cache + +eval: + lm_eval --model local-completions --tasks gsm8k \ + --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port 8192)/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \ + --limit 1000 + +eval_port PORT: + lm_eval --model 
#!/usr/bin/env python3
"""
Port Allocation Utility

A small utility that generates consistent port numbers based on username and
default port to avoid port collisions during development.
"""

import argparse
import getpass
import hashlib


def allocate_port(base_port,
                  username=None,
                  project_name=None,
                  port_range=None):
    """
    Allocate a deterministic port based on username and base port.

    The same (project_name, username, base_port) triple always maps to the
    same port, so every developer gets a stable personal port that is very
    unlikely to collide with other users' ports or with the service default.

    Args:
        base_port (int): The default port number for the service.
        username (str, optional): Username to use for hashing. Defaults to
            the current user.
        project_name (str, optional): Project name to make ports unique
            per project.
        port_range (tuple, optional): Inclusive (min, max) range of valid
            ports. Defaults to (1024, 65535).

    Returns:
        int: A port number within port_range derived from hashing the
        username and base port.
    """
    if not username:
        username = getpass.getuser()

    if not port_range:
        port_range = (1024, 65535)

    min_port, max_port = port_range
    # Bug fix (off-by-one): the range is documented as (min, max) of *valid*
    # ports, i.e. inclusive, so there are max - min + 1 candidates; the
    # original `max_port - min_port` made max_port itself unreachable.
    available_range = max_port - min_port + 1

    # Hash input is "project:username:base_port" (project part optional).
    hash_input = f"{username}:{base_port}"
    if project_name:
        hash_input = f"{project_name}:{hash_input}"

    # md5 is used only for a stable, well-spread mapping — not for security.
    hash_obj = hashlib.md5(hash_input.encode())
    hash_int = int(hash_obj.hexdigest(), 16)

    # Map the hash onto the valid port range.
    port_offset = hash_int % available_range
    allocated_port = min_port + port_offset

    # Best-effort: steer clear of the service's default port (within 10).
    if abs(allocated_port - base_port) < 10:
        # Bug fix: shift the *offset* and re-anchor at min_port exactly once.
        # The original `(allocated_port + 100) % available_range + min_port`
        # wrapped a value that already included min_port, effectively adding
        # min_port twice and scrambling the intended "+100" shift.
        # NOTE(review): for tiny ranges the shifted port can still land near
        # base_port — acceptable for a dev convenience tool.
        port_offset = (port_offset + 100) % available_range
        allocated_port = min_port + port_offset

    return allocated_port


def main():
    """Parse CLI arguments, allocate a port, and print it.

    Prints either the bare port number or, with --env-var NAME, a shell
    `export NAME=port` line suitable for use with `eval`.
    """
    parser = argparse.ArgumentParser(
        description='Allocate a consistent port based on username and base port'
    )
    parser.add_argument('base_port',
                        type=int,
                        help='The default port number for the service')
    parser.add_argument('--username',
                        '-u',
                        help='Username to use (defaults to current user)')
    parser.add_argument('--project',
                        '-p',
                        help='Project name to make ports unique per project')
    parser.add_argument('--env-var',
                        '-e',
                        help='Output as export ENV_VAR=port')
    parser.add_argument('--min-port',
                        type=int,
                        default=1024,
                        help='Minimum port number')
    parser.add_argument('--max-port',
                        type=int,
                        default=65535,
                        help='Maximum port number')

    args = parser.parse_args()

    port = allocate_port(args.base_port,
                         username=args.username,
                         project_name=args.project,
                         port_range=(args.min_port, args.max_port))

    if args.env_var:
        print(f"export {args.env_var}={port}")
    else:
        print(port)


if __name__ == "__main__":
    main()