Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
This commit is contained in:
rshaw@neuralmagic.com 2025-06-19 13:15:50 +00:00
parent 5b8c64dc77
commit 87bf6812b2
2 changed files with 209 additions and 0 deletions

110
pd_justfile/Justfile Normal file
View File

@ -0,0 +1,110 @@
# Setting this allows creating a symlink to Justfile from another dir
set working-directory := "/home/rshaw/vllm/pd_examples/"
# Needed for the proxy server
# NOTE(review): the trailing slash matters — recipes below interpolate
# "{{vllm-directory}}benchmarks/..." without inserting their own "/".
vllm-directory := "/home/rshaw/vllm/"
# MODEL := "Qwen/Qwen3-0.6B"
# Model served by every prefill/decode instance and used by the benchmarks/evals.
MODEL := "meta-llama/Llama-3.1-8B-Instruct"
# Map a canonical port to a per-user port via port_allocator.py so multiple
# developers on one machine don't collide. Used everywhere as $(just port N).
port PORT:
@python port_allocator.py {{PORT}}
# Prefill vLLM instance: TP=2 on GPUs 0,1 with the NIXL KV connector
# (kv_both role). Side-channel (5557) and HTTP (8100) ports are per-user.
prefill:
VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5557) \
CUDA_VISIBLE_DEVICES=0,1 \
vllm serve {{MODEL}} \
--port $(just port 8100) \
--tensor-parallel-size 2 \
--enforce-eager \
--disable-log-requests \
--block-size 128 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
# Second prefill instance (single GPU 6, no tensor parallelism) so the proxy
# can fan out across two prefill backends (see the commented-out proxy recipe).
prefill_b:
VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5558) \
CUDA_VISIBLE_DEVICES=6 \
vllm serve {{MODEL}} \
--port $(just port 8200) \
--enforce-eager \
--disable-log-requests \
--block-size 128 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
# Decode vLLM instance on HTTP port 8300, side channel 5559.
# NOTE(review): 4 GPUs (2,3,4,5) are exposed but --tensor-parallel-size is 2 —
# confirm whether the extra GPUs are intentional headroom.
decode:
VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \
CUDA_VISIBLE_DEVICES=2,3,4,5 \
vllm serve {{MODEL}} \
--port $(just port 8300) \
--tensor-parallel-size 2 \
--enforce-eager \
--disable-log-requests \
--block-size 128 \
--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
# proxy:
# python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
# --port $(just port 8192) \
# --prefiller-port $(just port 8100) $(just port 8200) \
# --prefiller-host localhost localhost \
# --decoder-port $(just port 8300)
# Toy prefill/decode proxy on port 8192, routing to one prefill backend (8100)
# and one decode backend (8300). The commented variant above used two prefills.
proxy:
python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
--port $(just port 8192) \
--prefiller-port $(just port 8100) \
--prefiller-host localhost \
--decoder-port $(just port 8300)
# Smoke test: send a single completion request through the proxy.
send_request:
curl -X POST http://localhost:$(just port 8192)/v1/completions \
-H "Content-Type: application/json" \
-d '{ \
"model": "{{MODEL}}", \
"prompt": "XXRed Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
"max_tokens": 150, \
"temperature": 0.7 \
}'
# Serving throughput benchmark through the PD proxy with random prompts.
# NUM_PROMPTS: number of random requests to issue.
# Fixes: dropped the stray trailing "\" (it was a line continuation that
# joined the next recipe header into this command) and the double slash
# ({{vllm-directory}} already ends in "/", matching benchmark_one below).
benchmark NUM_PROMPTS:
python {{vllm-directory}}benchmarks/benchmark_serving.py \
--port $(just port 8192) \
--model {{MODEL}} \
--dataset-name random \
--random-input-len 10000 \
--random-output-len 100 \
--num-prompts {{NUM_PROMPTS}} \
--seed $(date +%s)
# Latency benchmark: 10 single-concurrency requests of INPUT_LEN input tokens
# (1 output token each) through the PD proxy.
benchmark_one INPUT_LEN:
python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
--model {{MODEL}} \
--input-len {{INPUT_LEN}} \
--output-len 1 \
--num-requests 10 \
--seed $(date +%s) \
--port $(just port 8192)
# Same as benchmark_one but hits the prefill server (8100) directly,
# bypassing the proxy — the no-disaggregation baseline.
benchmark_one_no_pd INPUT_LEN:
python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
--model {{MODEL}} \
--input-len {{INPUT_LEN}} \
--output-len 1 \
--num-requests 10 \
--seed $(date +%s) \
--port $(just port 8100)
# Clear the prefix caches of both prefill servers (8100 and 8200) so repeated
# benchmark runs start from a cold cache.
reset_prefix_cache:
curl -X POST http://localhost:$(just port 8100)/reset_prefix_cache && \
curl -X POST http://localhost:$(just port 8200)/reset_prefix_cache
# GSM8K accuracy check through the proxy (lm-eval local-completions backend).
eval:
lm_eval --model local-completions --tasks gsm8k \
--model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port 8192)/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
--limit 1000
# Same GSM8K eval, but against an arbitrary server given by its canonical PORT
# (resolved through the per-user port allocator).
eval_port PORT:
lm_eval --model local-completions --tasks gsm8k \
--model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port {{PORT}})/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
--limit 1000

99
pd_justfile/port_allocator.py Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
Port Allocation Utility
A small utility that generates consistent port numbers based on username and default port
to avoid port collisions during development.
"""
import argparse
import getpass
import hashlib
def allocate_port(base_port,
                  username=None,
                  project_name=None,
                  port_range=None):
    """
    Allocate a deterministic port derived from username and base port,
    so several developers can share one machine without port collisions.

    Args:
        base_port (int): The default port number for the service
        username (str, optional): Username to use for hashing. Defaults to current user.
        project_name (str, optional): Project name to make ports unique per project
        port_range (tuple, optional): Inclusive range of valid ports (min, max).
            Defaults to (1024, 65535).

    Returns:
        int: A port inside ``port_range``, kept at least 10 away from
        ``base_port`` whenever the range allows it.
    """
    if not username:
        username = getpass.getuser()
    if not port_range:
        port_range = (1024, 65535)
    min_port, max_port = port_range
    # +1 makes the range inclusive; the original (max - min) could never
    # return max_port itself.
    available_range = max_port - min_port + 1

    # Create hash input from username, base_port and optional project_name
    hash_input = f"{username}:{base_port}"
    if project_name:
        hash_input = f"{project_name}:{hash_input}"

    # md5 is used only for stable bucketing, not for security.
    hash_obj = hashlib.md5(hash_input.encode())
    hash_int = int(hash_obj.hexdigest(), 16)

    # Generate a port within the valid range
    allocated_port = min_port + hash_int % available_range

    # Stay clear of base_port (within 10) so we never hand back the service's
    # well-known port. The original one-shot "+100, re-mod, +min_port" fixup
    # double-added min_port (skewing the mapping) and could still land inside
    # the window; instead step deterministically until we escape, bounded to
    # one full cycle so a tiny range cannot loop forever.
    step = 100 % available_range or 1
    for _ in range(available_range):
        if abs(allocated_port - base_port) >= 10:
            break
        allocated_port = min_port + (allocated_port - min_port + step) % available_range
    return allocated_port
def main():
    """CLI entry point: parse arguments, allocate a port, and print it.

    With ``--env-var NAME`` the output is an ``export NAME=<port>`` line
    (suitable for shell ``eval``); otherwise just the bare port number.
    """
    cli = argparse.ArgumentParser(
        description='Allocate a consistent port based on username and base port')
    cli.add_argument('base_port', type=int,
                     help='The default port number for the service')
    cli.add_argument('--username', '-u',
                     help='Username to use (defaults to current user)')
    cli.add_argument('--project', '-p',
                     help='Project name to make ports unique per project')
    cli.add_argument('--env-var', '-e',
                     help='Output as export ENV_VAR=port')
    cli.add_argument('--min-port', type=int, default=1024,
                     help='Minimum port number')
    cli.add_argument('--max-port', type=int, default=65535,
                     help='Maximum port number')
    ns = cli.parse_args()

    chosen = allocate_port(ns.base_port,
                           username=ns.username,
                           project_name=ns.project,
                           port_range=(ns.min_port, ns.max_port))

    # Emit either a shell export line or the bare port number.
    if ns.env_var:
        print(f"export {ns.env_var}={chosen}")
    else:
        print(chosen)


if __name__ == "__main__":
    main()