From b835205d33615ee60e01775a6f625fb4857061b4 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Mon, 7 Jul 2025 00:32:42 +0000
Subject: [PATCH] updated

Signed-off-by: Robert Shaw
---
 pd_justfile/Justfile                          | 95 ------------------
 pd_justfile/port_allocator.py                 | 99 ------------------
 .../kv_connector/v1/nixl_connector.py         |  4 -
 3 files changed, 198 deletions(-)
 delete mode 100644 pd_justfile/Justfile
 delete mode 100755 pd_justfile/port_allocator.py

diff --git a/pd_justfile/Justfile b/pd_justfile/Justfile
deleted file mode 100644
index aa83ea66e1fe7..0000000000000
--- a/pd_justfile/Justfile
+++ /dev/null
@@ -1,95 +0,0 @@
-# Setting this allows creating a symlink to Justfile from another dir
-set working-directory := "/home/rshaw/vllm/pd_examples/"
-
-# Needed for the proxy server
-vllm-directory := "/home/rshaw/vllm/"
-
-# MODEL := "Qwen/Qwen3-0.6B"
-MODEL := "meta-llama/Llama-3.1-8B-Instruct"
-
-port PORT:
-    @python port_allocator.py {{PORT}}
-
-
-prefill:
-    VLLM_IS_PREFILL=1 \
-    VLLM_NIXL_SIDE_CHANNEL_PORT=5557 \
-    CUDA_VISIBLE_DEVICES=7 \
-    vllm serve {{MODEL}} \
-        --port $(just port 8100) \
-        --tensor-parallel-size 1 \
-        --enforce-eager \
-        --disable-log-requests \
-        --block-size 128 \
-        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
-
-decode:
-    VLLM_IS_PREFILL=0 \
-    VLLM_NIXL_SIDE_CHANNEL_PORT=$(just port 5559) \
-    CUDA_VISIBLE_DEVICES=6 \
-    vllm serve {{MODEL}} \
-        --port $(just port 8300) \
-        --tensor-parallel-size 1 \
-        --enforce-eager \
-        --disable-log-requests \
-        --block-size 128 \
-        --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
-
-proxy:
-    python "{{vllm-directory}}tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
-        --port $(just port 8192) \
-        --prefiller-port $(just port 8100) \
-        --prefiller-host localhost \
-        --decoder-port $(just port 8300)
-
-send_request:
-    curl -X POST http://localhost:$(just port 8192)/v1/completions \
-        -H "Content-Type: application/json" \
-        -d '{ \
-            "model": "{{MODEL}}", \
-            "prompt": "XXRed Hat is the best open source company by far across Linux, K8s, and AI, and vLLM has the greatest community in open source AI software infrastructure. I love vLLM because", \
-            "max_tokens": 150, \
-            "temperature": 0.7 \
-        }'
-
-benchmark NUM_PROMPTS:
-    python {{vllm-directory}}/benchmarks/benchmark_serving.py \
-        --port $(just port 8192) \
-        --model {{MODEL}} \
-        --dataset-name random \
-        --random-input-len 10000 \
-        --random-output-len 100 \
-        --num-prompts {{NUM_PROMPTS}} \
-        --seed $(date +%s) \
-
-benchmark_one INPUT_LEN:
-    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
-        --model {{MODEL}} \
-        --input-len {{INPUT_LEN}} \
-        --output-len 1 \
-        --num-requests 10 \
-        --seed $(date +%s) \
-        --port $(just port 8192)
-
-benchmark_one_no_pd INPUT_LEN:
-    python {{vllm-directory}}benchmarks/benchmark_one_concurrent_req.py \
-        --model {{MODEL}} \
-        --input-len {{INPUT_LEN}} \
-        --output-len 1 \
-        --num-requests 10 \
-        --seed $(date +%s) \
-        --port $(just port 8100)
-
-reset_prefix_cache:
-    curl -X POST http://localhost:$(just port 8100)/reset_prefix_cache && \
-    curl -X POST http://localhost:$(just port 8200)/reset_prefix_cache
-
-eval:
-    lm_eval --model local-completions --tasks gsm8k \
-        --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port 8192)/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
-        --limit 1000
-
-eval_port PORT:
-    lm_eval --model local-completions --tasks gsm8k \
-        --model_args model={{MODEL}},base_url=http://127.0.0.1:$(just port {{PORT}})/v1/completions,num_concurrent=100,max_retries=3,tokenized_requests=False \
-        --limit 1000
\ No newline at end of file
diff --git a/pd_justfile/port_allocator.py b/pd_justfile/port_allocator.py
deleted file mode 100755
index 1ac5486049f58..0000000000000
--- a/pd_justfile/port_allocator.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python3
-"""
-Port Allocation Utility
-
-A small utility that generates consistent port numbers based on username and default port
-to avoid port collisions during development.
-"""
-
-import argparse
-import getpass
-import hashlib
-
-
-def allocate_port(base_port,
-                  username=None,
-                  project_name=None,
-                  port_range=None):
-    """
-    Allocate a port based on username and base port.
-
-    Args:
-        base_port (int): The default port number for the service
-        username (str, optional): Username to use for hashing. Defaults to current user.
-        project_name (str, optional): Project name to make ports unique per project
-        port_range (tuple, optional): Range of valid ports (min, max). Defaults to (1024, 65535).
-
-    Returns:
-        int: A port number derived from hashing the username and base port
-    """
-    if not username:
-        username = getpass.getuser()
-
-    if not port_range:
-        port_range = (1024, 65535)
-
-    min_port, max_port = port_range
-    available_range = max_port - min_port
-
-    # Create hash input from username, base_port and optional project_name
-    hash_input = f"{username}:{base_port}"
-    if project_name:
-        hash_input = f"{project_name}:{hash_input}"
-
-    # Create a hash and convert to an integer in our port range
-    hash_obj = hashlib.md5(hash_input.encode())
-    hash_int = int(hash_obj.hexdigest(), 16)
-
-    # Generate a port within the valid range
-    port_offset = hash_int % available_range
-    allocated_port = min_port + port_offset
-
-    # Check if it's too close to the base_port (within 10)
-    if abs(allocated_port - base_port) < 10:
-        # Add a small offset to avoid collisions with the default port
-        allocated_port = (allocated_port + 100) % available_range + min_port
-
-    return allocated_port
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Allocate a consistent port based on username and base port'
-    )
-    parser.add_argument('base_port',
-                        type=int,
-                        help='The default port number for the service')
-    parser.add_argument('--username',
-                        '-u',
-                        help='Username to use (defaults to current user)')
-    parser.add_argument('--project',
-                        '-p',
-                        help='Project name to make ports unique per project')
-    parser.add_argument('--env-var',
-                        '-e',
-                        help='Output as export ENV_VAR=port')
-    parser.add_argument('--min-port',
-                        type=int,
-                        default=1024,
-                        help='Minimum port number')
-    parser.add_argument('--max-port',
-                        type=int,
-                        default=65535,
-                        help='Maximum port number')
-
-    args = parser.parse_args()
-
-    port = allocate_port(args.base_port,
-                         username=args.username,
-                         project_name=args.project,
-                         port_range=(args.min_port, args.max_port))
-
-    if args.env_var:
-        print(f"export {args.env_var}={port}")
-    else:
-        print(port)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 4c777c7788c6c..325530cab69aa 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -36,7 +36,6 @@ if TYPE_CHECKING:
 
 Transfer = tuple[int, float]  # (xfer_handle, start_time)
 GET_META_MSG = b"get_meta_msg"
-NIXL_MAX_DESCS = 1000
 
 logger = init_logger(__name__)
 
@@ -861,12 +860,9 @@ class NixlConnectorWorker:
 
             # Done.
             if len(new_handles) == 0:
-                start = time.perf_counter()
                 self.nixl_wrapper.send_notif(agent_name, notif_id)
                 del transfers[req_id]
                 done_req_ids.add(req_id)
-                end = time.perf_counter()
-                print(f"========= SEND NOTIF TIME: {end - start} =========")
             else:
                 transfers[req_id] = (new_handles, agent_name, notif_id)
 