mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-25 05:55:52 +08:00
parent
5b8c64dc77
commit
87bf6812b2
110
pd_justfile/Justfile
Normal file
110
pd_justfile/Justfile
Normal file
@ -0,0 +1,110 @@
|
||||
# Setting this allows creating a symlink to Justfile from another dir
set working-directory := "/home/rshaw/vllm/pd_examples/"

# Needed for the proxy server. NOTE: the trailing slash matters —
# recipes below concatenate as "{{vllm-directory}}benchmarks/...".
vllm-directory := "/home/rshaw/vllm/"

# Model served by every recipe below (small Qwen variant kept for quick tests).
# MODEL := "Qwen/Qwen3-0.6B"
MODEL := "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
||||
# Map a well-known default PORT to this user's collision-free port via
# port_allocator.py. The leading @ suppresses command echoing so stdout is
# only the port number, making `$(just port N)` usable in substitutions below.
port PORT:
    @python port_allocator.py {{PORT}}
|
||||
|
||||
|
||||
# Prefill server "a": tensor-parallel across GPUs 0,1, HTTP on the per-user
# port derived from 8100, NIXL side channel derived from 5557.
# NOTE(review): kv_role is "kv_both" rather than "kv_producer" — presumably
# intentional for this setup; confirm against the NixlConnector docs.
prefill:
    VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5557) \
    CUDA_VISIBLE_DEVICES=0,1 \
    vllm serve {{MODEL}} \
        --port $(just port 8100) \
        --tensor-parallel-size 2 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
|
||||
# Second prefill server: single GPU (device 6), HTTP on the per-user port
# derived from 8200, NIXL side channel derived from 5558. Used by the
# two-prefiller proxy variant (commented out below).
prefill_b:
    VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5558) \
    CUDA_VISIBLE_DEVICES=6 \
    vllm serve {{MODEL}} \
        --port $(just port 8200) \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
|
||||
# Decode server: HTTP on the per-user port derived from 8300, NIXL side
# channel derived from 5559.
# NOTE(review): four GPUs are made visible (2,3,4,5) but TP is 2, so only
# two are used — confirm whether the extra devices are intentional headroom.
decode:
    VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \
    CUDA_VISIBLE_DEVICES=2,3,4,5 \
    vllm serve {{MODEL}} \
        --port $(just port 8300) \
        --tensor-parallel-size 2 \
        --enforce-eager \
        --disable-log-requests \
        --block-size 128 \
        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
|
||||
|
||||
# proxy:
|
||||
# python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
|
||||
# --port $(just port 8192) \
|
||||
# --prefiller-port $(just port 8100) $(just port 8200) \
|
||||
# --prefiller-host localhost localhost \
|
||||
# --decoder-port $(just port 8300)
|
||||
|
||||
# Toy P/D proxy: listens on the per-user port derived from 8192, forwards
# prefill traffic to the `prefill` server (8100) and decode traffic to the
# `decode` server (8300). A two-prefiller variant is kept commented out above.
proxy:
    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
        --port $(just port 8192) \
        --prefiller-port $(just port 8100) \
        --prefiller-host localhost \
        --decoder-port $(just port 8300)
|
||||
|
||||
# Smoke-test completion request through the proxy (port derived from 8192).
# The backslashes inside the quoted JSON are just's line continuations, so
# the whole curl invocation runs as one shell command.
send_request:
    curl -X POST http://localhost:$(just port 8192)/v1/completions \
        -H "Content-Type: application/json" \
        -d '{ \
            "model": "{{MODEL}}", \
            "prompt": "XXRed Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
            "max_tokens": 150, \
            "temperature": 0.7 \
        }'
|
||||
|
||||
# Serving benchmark through the proxy with NUM_PROMPTS random prompts
# (long inputs, short outputs — a prefill-heavy workload). Fresh seed per
# run so the random dataset differs across invocations.
# Fixes: removed the dangling trailing "\" after --seed (it continued the
# command into the following lines), and dropped the doubled slash in the
# script path ({{vllm-directory}} already ends in "/").
benchmark NUM_PROMPTS:
    python {{vllm-directory}}benchmarks/benchmark_serving.py \
        --port $(just port 8192) \
        --model {{MODEL}} \
        --dataset-name random \
        --random-input-len 10000 \
        --random-output-len 100 \
        --num-prompts {{NUM_PROMPTS}} \
        --seed $(date +%s)
|
||||
|
||||
# Single-concurrency latency benchmark through the proxy (port from 8192):
# 10 sequential requests of INPUT_LEN tokens, 1 output token each, so the
# measurement isolates prefill/transfer latency.
benchmark_one INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s) \
        --port $(just port 8192)
|
||||
|
||||
# Same single-concurrency benchmark as `benchmark_one`, but pointed directly
# at the prefill server (port from 8100) instead of the proxy — i.e. the
# no-disaggregation baseline for comparison.
benchmark_one_no_pd INPUT_LEN:
    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
        --model {{MODEL}} \
        --input-len {{INPUT_LEN}} \
        --output-len 1 \
        --num-requests 10 \
        --seed $(date +%s) \
        --port $(just port 8100)
|
||||
|
||||
# Clear the prefix caches on both prefill servers (8100 and 8200) so repeated
# benchmark runs start from a cold cache. && short-circuits: the second reset
# is skipped if the first server is unreachable.
reset_prefix_cache:
    curl -X POST http://localhost:$(just port 8100)/reset_prefix_cache && \
    curl -X POST http://localhost:$(just port 8200)/reset_prefix_cache
|
||||
|
||||
# GSM8K accuracy check through the proxy (port from 8192) via lm-eval's
# local-completions backend: 100 concurrent requests, 3 retries, capped at
# 1000 samples.
eval:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port 8192)/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000
|
||||
|
||||
# Same GSM8K eval as `eval`, but against an arbitrary server chosen by its
# default PORT (resolved through `just port`), e.g. `just eval_port 8100`
# to evaluate a prefill server directly.
eval_port PORT:
    lm_eval --model local-completions --tasks gsm8k \
        --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port {{PORT}})/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
        --limit 1000
|
||||
99
pd_justfile/port_allocator.py
Executable file
99
pd_justfile/port_allocator.py
Executable file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Port Allocation Utility
|
||||
|
||||
A small utility that generates consistent port numbers based on username and default port
|
||||
to avoid port collisions during development.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import getpass
|
||||
import hashlib
|
||||
|
||||
|
||||
def allocate_port(base_port,
                  username=None,
                  project_name=None,
                  port_range=None):
    """
    Allocate a deterministic port based on username and base port.

    Hashes "<project>:<user>:<base_port>" and maps the digest into the
    inclusive [min_port, max_port] range, so each developer gets a stable
    per-service port without manual coordination.

    Args:
        base_port (int): The default port number for the service.
        username (str, optional): Username to use for hashing. Defaults to
            the current OS user.
        project_name (str, optional): Project name to make ports unique
            per project.
        port_range (tuple, optional): Inclusive (min, max) range of valid
            ports. Defaults to (1024, 65535).

    Returns:
        int: A port number derived from hashing the username and base port.

    Raises:
        ValueError: If min_port > max_port.
    """
    if not username:
        username = getpass.getuser()

    if not port_range:
        port_range = (1024, 65535)

    min_port, max_port = port_range
    if min_port > max_port:
        raise ValueError(f"invalid port range: {port_range}")

    # +1 makes the range inclusive of max_port; the original
    # `max_port - min_port` arithmetic could never allocate max_port.
    available_range = max_port - min_port + 1

    # Create hash input from username, base_port and optional project_name.
    # MD5 is fine here: it only spreads ports, it is not a security boundary.
    hash_input = f"{username}:{base_port}"
    if project_name:
        hash_input = f"{project_name}:{hash_input}"

    # Hash and map the digest into [min_port, max_port].
    hash_obj = hashlib.md5(hash_input.encode())
    hash_int = int(hash_obj.hexdigest(), 16)
    allocated_port = min_port + hash_int % available_range

    # Avoid landing right next to the well-known default port (within 10):
    # shift by 100 in *offset* space, wrapping inside the valid range.
    # (The original shifted the absolute port modulo the range, which could
    # jump to a distant, surprising port.)
    if abs(allocated_port - base_port) < 10:
        allocated_port = min_port + (allocated_port - min_port +
                                     100) % available_range

    return allocated_port
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and print the allocated port."""
    arg_parser = argparse.ArgumentParser(
        description='Allocate a consistent port based on username and base port')
    arg_parser.add_argument('base_port', type=int,
                            help='The default port number for the service')
    arg_parser.add_argument('--username', '-u',
                            help='Username to use (defaults to current user)')
    arg_parser.add_argument('--project', '-p',
                            help='Project name to make ports unique per project')
    arg_parser.add_argument('--env-var', '-e',
                            help='Output as export ENV_VAR=port')
    arg_parser.add_argument('--min-port', type=int, default=1024,
                            help='Minimum port number')
    arg_parser.add_argument('--max-port', type=int, default=65535,
                            help='Maximum port number')

    opts = arg_parser.parse_args()

    allocated = allocate_port(opts.base_port,
                              username=opts.username,
                              project_name=opts.project,
                              port_range=(opts.min_port, opts.max_port))

    # Emit either a shell-eval'able export line or the bare port number.
    if opts.env_var:
        print(f"export {opts.env_var}={allocated}")
    else:
        print(allocated)


if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user