diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml new file mode 100644 index 0000000000000..199c33159fde3 --- /dev/null +++ b/.buildkite/ci_config.yaml @@ -0,0 +1,24 @@ +name: vllm_ci +job_dirs: + - ".buildkite/test_areas" + - ".buildkite/image_build" +run_all_patterns: + - "docker/Dockerfile" + - "CMakeLists.txt" + - "requirements/common.txt" + - "requirements/cuda.txt" + - "requirements/build.txt" + - "requirements/test.txt" + - "setup.py" + - "csrc/" + - "cmake/" +run_all_exclude_patterns: + - "docker/Dockerfile." + - "csrc/cpu/" + - "csrc/rocm/" + - "cmake/hipify.py" + - "cmake/cpu_extension.cmake" +registries: public.ecr.aws/q9t5s3a7 +repositories: + main: "vllm-ci-postmerge-repo" + premerge: "vllm-ci-test-repo" diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py deleted file mode 100644 index bbed80ebe8476..0000000000000 --- a/.buildkite/generate_index.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import os - -template = """ - - -

Links for vLLM

- {x86_wheel}
- {arm_wheel}
- - -""" - -parser = argparse.ArgumentParser() -parser.add_argument("--wheel", help="The wheel path.", required=True) -args = parser.parse_args() - -filename = os.path.basename(args.wheel) - -with open("index.html", "w") as f: - print(f"Generated index.html for {args.wheel}") - # sync the abi tag with .buildkite/scripts/upload-wheels.sh - if "x86_64" in filename: - x86_wheel = filename - arm_wheel = filename.replace("x86_64", "aarch64").replace( - "manylinux1", "manylinux2014" - ) - elif "aarch64" in filename: - x86_wheel = filename.replace("aarch64", "x86_64").replace( - "manylinux2014", "manylinux1" - ) - arm_wheel = filename - else: - raise ValueError(f"Unsupported wheel: {filename}") - # cloudfront requires escaping the '+' character - f.write( - template.format( - x86_wheel=x86_wheel, - x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), - arm_wheel=arm_wheel, - arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), - ) - ) diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh new file mode 100755 index 0000000000000..9a2384e524b63 --- /dev/null +++ b/.buildkite/image_build/image_build.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -e + +if [[ $# -lt 8 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 +BRANCH=$4 +VLLM_USE_PRECOMPILED=$5 +VLLM_MERGE_BASE_COMMIT=$6 +CACHE_FROM=$7 +CACHE_TO=$8 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com + +# docker buildx +docker buildx create --name vllm-builder --driver docker-container --use +docker buildx inspect --bootstrap +docker buildx ls + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then + merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}" +else + merge_base_commit_build_args="" +fi + +# build +docker buildx build --file docker/Dockerfile \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg USE_SCCACHE=1 \ + --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \ + --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \ + --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \ + ${merge_base_commit_build_args} \ + --cache-from type=registry,ref=${CACHE_FROM},mode=max \ + --cache-to type=registry,ref=${CACHE_TO},mode=max \ + --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \ + $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \ + --push \ + --target test \ + --progress plain . diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml new file mode 100644 index 0000000000000..d01c71dd9becf --- /dev/null +++ b/.buildkite/image_build/image_build.yaml @@ -0,0 +1,57 @@ +group: Abuild +steps: + - label: ":docker: Build image" + key: image-build + depends_on: [] + commands: + - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU image" + key: image-build-cpu + depends_on: [] + commands: + - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build HPU image" + soft_fail: true + depends_on: [] + key: image-build-hpu + commands: + - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + + - label: ":docker: Build CPU arm64 image" + key: cpu-arm64-image-build + depends_on: [] + optional: true + commands: + - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh new file mode 100755 index 0000000000000..a69732f430985 --- /dev/null +++ b/.buildkite/image_build/image_build_cpu.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --build-arg VLLM_CPU_AVX512BF16=true \ + --build-arg VLLM_CPU_AVX512VNNI=true \ + --build-arg VLLM_CPU_AMXBF16=true \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh new file mode 100755 index 0000000000000..615298b6555bd --- /dev/null +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build --file docker/Dockerfile.cpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \ + --target vllm-test \ + --progress plain . + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh new file mode 100755 index 0000000000000..192447ef4577e --- /dev/null +++ b/.buildkite/image_build/image_build_hpu.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 " + exit 1 +fi + +REGISTRY=$1 +REPO=$2 +BUILDKITE_COMMIT=$3 + +# authenticate with AWS ECR +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY + +# skip build if image already exists +if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then + echo "Image not found, proceeding with build..." +else + echo "Image found" + exit 0 +fi + +# build +docker build \ + --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \ + --build-arg max_jobs=16 \ + --build-arg buildkite_commit=$BUILDKITE_COMMIT \ + --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \ + --progress plain \ + https://github.com/vllm-project/vllm-gaudi.git + +# push +docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml index 46f1a9fbf6ff9..6c0b5540cbb6a 100644 --- a/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml @@ -8,3 +8,4 @@ tasks: value: 0.80 limit: 250 # will run on 250 * 14 subjects = 3500 samples num_fewshot: 5 +rtol: 0.05 diff --git a/.buildkite/lm-eval-harness/configs/models-large-rocm.txt b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt new file mode 100644 index 0000000000000..4fb0b84bc4d81 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large-rocm.txt @@ -0,0 +1 @@ +Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 3627b760eddcf..f94d681197d2d 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \ --tp-size=1 """ +import os +from contextlib import contextmanager + import lm_eval import numpy as np import yaml -RTOL = 0.08 +DEFAULT_RTOL = 0.08 + + +@contextmanager +def scoped_env_vars(new_env: dict[str, str]): + if not new_env: + # Fast path: nothing to do + yield + return + + old_values = {} + new_keys = [] + + try: + for key, value in new_env.items(): + if key in os.environ: + old_values[key] = os.environ[key] + else: + new_keys.append(key) + os.environ[key] = str(value) + yield + finally: + # Restore / clean up + for key, value in old_values.items(): + os.environ[key] = value + for key in new_keys: + os.environ.pop(key, None) def launch_lm_eval(eval_config, tp_size): @@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size): f"trust_remote_code={trust_remote_code}," f"max_model_len={max_model_len}," ) - results = lm_eval.simple_evaluate( - model=backend, - model_args=model_args, - tasks=[task["name"] for task in eval_config["tasks"]], - num_fewshot=eval_config["num_fewshot"], - limit=eval_config["limit"], - # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help - # text models. however, this is regressing measured strict-match for - # existing text models in CI, so only apply it for mm, or explicitly set - apply_chat_template=eval_config.get( - "apply_chat_template", backend == "vllm-vlm" - ), - fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), - # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) - gen_kwargs=eval_config.get("gen_kwargs"), - batch_size=batch_size, - ) + + env_vars = eval_config.get("env_vars", None) + with scoped_env_vars(env_vars): + results = lm_eval.simple_evaluate( + model=backend, + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help + # text models. however, this is regressing measured strict-match for + # existing text models in CI, so only apply it for mm, or explicitly set + apply_chat_template=eval_config.get( + "apply_chat_template", backend == "vllm-vlm" + ), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False), + # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...) + gen_kwargs=eval_config.get("gen_kwargs"), + batch_size=batch_size, + ) return results @@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size): results = launch_lm_eval(eval_config, tp_size) + rtol = eval_config.get("rtol", DEFAULT_RTOL) + success = True for task in eval_config["tasks"]: for metric in task["metrics"]: @@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size): measured_value = results["results"][task["name"]][metric["name"]] print( f"{task['name']} | {metric['name']}: " - f"ground_truth={ground_truth} | measured={measured_value}" + f"ground_truth={ground_truth:.3f} | " + f"measured={measured_value:.3f} | rtol={rtol}" ) - success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) + success = success and np.isclose(ground_truth, measured_value, rtol=rtol) assert success diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 6d494f64f14fa..64a262c6cb401 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http ## Performance benchmark quick overview -**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models. +**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors, Intel® Gaudi® 3 Accelerators and Arm® Neoverse™ with different models. **Benchmarking Duration**: about 1hr. @@ -23,7 +23,7 @@ bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh Runtime environment variables: -- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. +- `ON_CPU`: set the value to '1' on Intel® Xeon® and Arm® Neoverse™ Processors. Default value is 0. - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). - `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file). @@ -34,8 +34,9 @@ Runtime environment variables: See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. -For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. -> +> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead. +> For Arm® Neoverse™, use `tests/latency-tests-arm64-cpu.json`, `tests/throughput-tests-arm64-cpu.json`, `tests/serving-tests-arm64-cpu.json` instead. + ### Latency test Here is an example of one test inside `latency-tests.json`: @@ -108,6 +109,65 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. +#### Default Parameters Field + +We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example: + +
+ An Example of default parameters field + +```json +{ + "defaults": { + "qps_list": [ + "inf" + ], + "server_environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "server_parameters": { + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "block_size": 128, + "disable_log_stats": "", + "load_format": "dummy" + }, + "client_parameters": { + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "num_prompts": 200, + "ignore-eos": "" + } + }, + "tests": [ + { + "test_name": "serving_llama3B_tp2_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 2, + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + } + }, + { + "test_name": "serving_qwen3_tp4_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-14B", + "tensor_parallel_size": 4, + }, + "client_parameters": { + "model": "Qwen/Qwen3-14B", + } + }, + ] +} +``` + +
+ ### Visualizing the results The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh old mode 100644 new mode 100755 index 99a5a5e334f8e..6b6a7e472b9c8 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -49,7 +49,11 @@ check_cpus() { echo "Need at least 1 NUMA to run benchmarking." exit 1 fi - declare -g gpu_type="cpu" + if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then + declare -g gpu_type="arm64-cpu" + else + declare -g gpu_type="cpu" + fi echo "GPU type is $gpu_type" } @@ -110,7 +114,8 @@ json2envs() { wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes - timeout 1200 bash -c ' + local timeout_val="1200" + timeout "$timeout_val" bash -c ' until curl -X POST localhost:8000/v1/completions; do sleep 1 done' && return 0 || return 1 @@ -206,8 +211,8 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -275,8 +280,8 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -316,12 +321,44 @@ run_throughput_tests() { run_serving_tests() { # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases + # + # Supported JSON formats: + # 1) Plain format: top-level array + # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # + # 2) Default parameters field + plain format tests + # { + # "defaults": { ... }, + # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # } local serving_test_file serving_test_file=$1 # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do + jq -c ' + if type == "array" then + # Plain format: test cases array + .[] + elif (type == "object" and has("tests")) then + # merge the default parameters into each test cases + . as $root + | ($root.defaults // {}) as $d + | ($root.tests // [])[] + # default qps / max_concurrency from defaults if missing + | .qps_list = (.qps_list // $d.qps_list) + | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list) + # merge envs / params: test overrides defaults + | .server_environment_variables = + (($d.server_environment_variables // {}) + (.server_environment_variables // {})) + | .server_parameters = + (($d.server_parameters // {}) + (.server_parameters // {})) + | .client_parameters = + (($d.client_parameters // {}) + (.client_parameters // {})) + else + error("Unsupported serving test file format: must be array or object with .tests") + end + ' "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') if [[ ! "$test_name" =~ ^serving_ ]]; then @@ -335,28 +372,33 @@ run_serving_tests() { continue fi - # get client and server arguments + # get client and server arguments (after merged the default parameters) server_params=$(echo "$params" | jq -r '.server_parameters') server_envs=$(echo "$params" | jq -r '.server_environment_variables') client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") server_envs=$(json2envs "$server_envs") client_args=$(json2args "$client_params") + + # qps_list qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" + + # max_concurrency_list (fallback to num_prompts if missing) max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then - num_prompts=$(echo "$client_params" | jq -r '.num_prompts') - max_concurrency_list="[$num_prompts]" + num_prompts=$(echo "$client_params" | jq -r '.num_prompts') + max_concurrency_list="[$num_prompts]" fi max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') echo "Running over max concurrency list $max_concurrency_list" # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ]; then - pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') + if [[ "$ON_CPU" == "1" ]]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1') world_size=$(($tp*$pp)) if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." @@ -458,9 +500,9 @@ run_serving_tests() { main() { local ARCH ARCH='' - if [ "$ON_CPU" == "1" ];then - check_cpus - ARCH='-cpu' + if [[ "$ON_CPU" == "1" ]]; then + check_cpus + ARCH="-$gpu_type" else check_gpus ARCH="$arch_suffix" diff --git a/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json new file mode 100644 index 0000000000000..fba695041e3ee --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json @@ -0,0 +1,26 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json new file mode 100644 index 0000000000000..63f1f8ab887b3 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json @@ -0,0 +1,130 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 12, + 16, + 24, + 32, + 64, + 128, + 200 + ], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + } + ] +} \ No newline at end of file diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json deleted file mode 100644 index f758097e098e4..0000000000000 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json +++ /dev/null @@ -1,610 +0,0 @@ -[ - { - "test_name": "serving_llama8B_bf16_tp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - } -] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json deleted file mode 100644 index 0b1a42e790255..0000000000000 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json +++ /dev/null @@ -1,1023 +0,0 @@ -[ - { - "test_name": "serving_llama8B_bf16_pp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_pp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_pp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_pp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_pp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_pp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - } -] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index f792956f39472..8f7200862d20c 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -1,276 +1,246 @@ -[ - { - "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 32 - } +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 }, - { - "test_name": "serving_llama8B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 32 - } + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" }, - { - "test_name": "serving_llama8B_tp1_random_128_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp2_random_128_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp1_random_128_2048", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp2_random_128_2048", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp1_random_2048_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp2_random_2048_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 } -] + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama3B_tp1_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_granite2B_tp1_random_128_128", + "server_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen1.7B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-1.7B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-1.7B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen4B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-4B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-4B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen8B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_glm9B_tp1_random_128_128", + "server_parameters": { + "model": "zai-org/glm-4-9b-hf", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "zai-org/glm-4-9b-hf", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_gemma7B_tp1_random_128_128", + "server_parameters": { + "model": "google/gemma-7b", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "google/gemma-7b", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json new file mode 100644 index 0000000000000..da84dd4d0c67a --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json @@ -0,0 +1,27 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 38c400ba1faf5..a9d51557bd9bb 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -8,13 +8,28 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" + - label: "Build arm64 wheel - CUDA 13.0" + depends_on: ~ + id: build-wheel-arm64-cuda-13-0 + agents: + queue: arm64_cpu_queue_postmerge + commands: + # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: + # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" + env: + DOCKER_BUILDKIT: "1" + # aarch64 build - label: "Build arm64 CPU wheel" depends_on: ~ @@ -25,24 +40,11 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" env: DOCKER_BUILDKIT: "1" # x86 + CUDA builds - - label: "Build wheel - CUDA 12.8" - depends_on: ~ - id: build-wheel-cuda-12-8 - agents: - queue: cpu_queue_postmerge - commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - - "mkdir artifacts" - - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" - env: - DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.9" depends_on: ~ id: build-wheel-cuda-12-9 @@ -52,7 +54,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31" env: DOCKER_BUILDKIT: "1" @@ -65,7 +67,21 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" + env: + DOCKER_BUILDKIT: "1" + + # x86 CPU wheel build + - label: "Build x86 CPU wheel" + depends_on: ~ + id: build-wheel-x86-cpu + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" env: DOCKER_BUILDKIT: "1" @@ -109,7 +125,6 @@ steps: - label: "Annotate release workflow" depends_on: - create-multi-arch-manifest - - build-wheel-cuda-12-8 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 56bb5cedaa0a9..df805e0850806 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -23,8 +23,8 @@ To download the wheel (by version): aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl . \`\`\` To download and upload the image: @@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest rm vllm/vllm-openai:latest +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` -EOF \ No newline at end of file +EOF diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py new file mode 100644 index 0000000000000..1794df9479e55 --- /dev/null +++ b/.buildkite/scripts/generate-nightly-index.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# do not complain about line length (for docstring) +# ruff: noqa: E501 + +import argparse +import json +import sys +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Any +from urllib.parse import quote + +import regex as re + +if not sys.version_info >= (3, 12): + raise RuntimeError("This script requires Python 3.12 or higher.") + +INDEX_HTML_TEMPLATE = """ + + + + +{items} + + +""" + + +@dataclass +class WheelFileInfo: + package_name: str + version: str + build_tag: str | None + python_tag: str + abi_tag: str + platform_tag: str + variant: str | None + filename: str + + +def parse_from_filename(file: str) -> WheelFileInfo: + """ + Parse wheel file name to extract metadata. + + The format of wheel names: + {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl + All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not). + Example: + vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl + vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl + vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl + vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl + """ + wheel_file_re = re.compile( + r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$" + ) + match = wheel_file_re.match(file) + if not match: + raise ValueError(f"Invalid wheel file name: {file}") + + package_name = match.group("package_name") + version = match.group("version") + build_tag = match.group("build_tag") + python_tag = match.group("python_tag") + abi_tag = match.group("abi_tag") + platform_tag = match.group("platform_tag") + + # extract variant from version + variant = None + if "dev" in version: + ver_after_dev = version.split("dev")[-1] + if "." in ver_after_dev: + variant = ver_after_dev.split(".")[-1] + version = version.removesuffix("." + variant) + else: + if "+" in version: + version, variant = version.split("+") + + return WheelFileInfo( + package_name=package_name, + version=version, + build_tag=build_tag, + python_tag=python_tag, + abi_tag=abi_tag, + platform_tag=platform_tag, + variant=variant, + filename=file, + ) + + +def generate_project_list(subdir_names: list[str], comment: str = "") -> str: + """ + Generate project list HTML content linking to each project & variant sub-directory. + """ + href_tags = [] + for name in sorted(subdir_names): + name = name.strip("/").strip(".") + href_tags.append(f' {name}/
') + return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment) + + +def generate_package_index_and_metadata( + wheel_files: list[WheelFileInfo], + wheel_base_dir: Path, + index_base_dir: Path, + comment: str = "", +) -> tuple[str, str]: + """ + Generate package index HTML content for a specific package, linking to actual wheel files. + """ + href_tags = [] + metadata = [] + for file in sorted(wheel_files, key=lambda x: x.filename): + relative_path = ( + wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename + ) + # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B' + # NOTE: this is AWS S3 specific behavior! + file_path_quoted = quote(relative_path.as_posix(), safe=":%/") + href_tags.append(f' {file.filename}
') + file_meta = asdict(file) + file_meta["path"] = file_path_quoted + metadata.append(file_meta) + index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment) + metadata_str = json.dumps(metadata, indent=2) + return index_str, metadata_str + + +def generate_index_and_metadata( + whl_files: list[str], + wheel_base_dir: Path, + index_base_dir: Path, + default_variant: str | None = None, + alias_to_default: str | None = None, + comment: str = "", +): + """ + Generate index for all wheel files. + + Args: + whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`). + wheel_base_dir (Path): Base directory for wheel files. + index_base_dir (Path): Base directory to store index files. + default_variant (str | None): The default variant name, if any. + alias_to_default (str | None): Alias variant name for the default variant, if any. + comment (str | None): Optional comment to include in the generated HTML files. + + First, parse all wheel files to extract metadata. + We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). + The index for the default variant (if any) is generated in the root index directory. + + If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index + is purely a copy of the corresponding variant index, with only the links adjusted. + Otherwise, all wheels without variant suffixes are treated as the default variant. + + If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content + as the default variant index, but the links are adjusted accordingly. + + Index directory structure: + index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) + index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories + vllm/ + index.html # package index, pointing to actual files in wheel_base_dir (relative path) + metadata.json # machine-readable metadata for all wheels in this package + cpu/ # cpu variant sub-directory + index.html + vllm/ + index.html + metadata.json + cu129/ # cu129 is actually the alias to default variant + index.html + vllm/ + index.html + metadata.json + cu130/ # cu130 variant sub-directory + index.html + vllm/ + index.html + metadata.json + ... + + metadata.json stores a dump of all wheel files' metadata in a machine-readable format: + [ + { + "package_name": "vllm", + "version": "0.10.2rc2", + "build_tag": null, + "python_tag": "cp38", + "abi_tag": "abi3", + "platform_tag": "manylinux2014_aarch64", + "variant": "cu129", + "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", + "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded + }, + ... + ] + """ + + parsed_files = [parse_from_filename(f) for f in whl_files] + + if not parsed_files: + print("No wheel files found, skipping index generation.") + return + + # Group by variant + variant_to_files: dict[str, list[WheelFileInfo]] = {} + for file in parsed_files: + variant = file.variant or "default" + if variant not in variant_to_files: + variant_to_files[variant] = [] + variant_to_files[variant].append(file) + + print(f"Found variants: {list(variant_to_files.keys())}") + + # sanity check for default variant + if default_variant: + if "default" in variant_to_files: + raise ValueError( + "All wheel files must have variant suffixes when `default_variant` is specified." + ) + if default_variant not in variant_to_files: + raise ValueError( + f"Default variant '{default_variant}' not found among wheel files." + ) + + if alias_to_default: + if "default" not in variant_to_files: + # e.g. only some wheels are uploaded to S3 currently + print( + "[WARN] Alias to default variant specified, but no default variant found." + ) + elif alias_to_default in variant_to_files: + raise ValueError( + f"Alias variant name '{alias_to_default}' already exists among wheel files." + ) + else: + variant_to_files[alias_to_default] = variant_to_files["default"].copy() + print(f"Alias variant '{alias_to_default}' created for default variant.") + + # Generate comment in HTML header + comment_str = f" ({comment})" if comment else "" + comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}" + + # Generate index for each variant + subdir_names = set() + for variant, files in variant_to_files.items(): + if variant == "default": + variant_dir = index_base_dir + else: + variant_dir = index_base_dir / variant + subdir_names.add(variant) + + variant_dir.mkdir(parents=True, exist_ok=True) + + # gather all package names in this variant + packages = set(f.package_name for f in files) + if variant == "default": + # these packages should also appear in the "project list" + # generate after all variants are processed + subdir_names = subdir_names.union(packages) + else: + # generate project list for this variant directly + project_list_str = generate_project_list(sorted(packages), comment_tmpl) + with open(variant_dir / "index.html", "w") as f: + f.write(project_list_str) + + for package in packages: + # filter files belonging to this package only + package_files = [f for f in files if f.package_name == package] + package_dir = variant_dir / package + package_dir.mkdir(parents=True, exist_ok=True) + index_str, metadata_str = generate_package_index_and_metadata( + package_files, wheel_base_dir, package_dir, comment + ) + with open(package_dir / "index.html", "w") as f: + f.write(index_str) + with open(package_dir / "metadata.json", "w") as f: + f.write(metadata_str) + + # Generate top-level project list index + project_list_str = generate_project_list(sorted(subdir_names), comment_tmpl) + with open(index_base_dir / "index.html", "w") as f: + f.write(project_list_str) + + +if __name__ == "__main__": + """ + Arguments: + --version : version string for the current build (e.g., commit hash) + --wheel-dir : directory containing wheel files (default to be same as `version`) + --current-objects : path to JSON file containing current S3 objects listing in this version directory + --output-dir : directory to store generated index files + --alias-to-default : (optional) alias variant name for the default variant + --comment : (optional) comment string to include in generated HTML files + """ + + parser = argparse.ArgumentParser( + description="Process nightly build wheel files to generate indices." + ) + parser.add_argument( + "--version", + type=str, + required=True, + help="Version string for the current build (e.g., commit hash)", + ) + parser.add_argument( + "--current-objects", + type=str, + required=True, + help="Path to JSON file containing current S3 objects listing in this version directory", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Directory to store generated index files", + ) + parser.add_argument( + "--wheel-dir", + type=str, + default=None, + help="Directory containing wheel files (default to be same as `version`)", + ) + parser.add_argument( + "--alias-to-default", + type=str, + default=None, + help="Alias variant name for the default variant", + ) + parser.add_argument( + "--comment", + type=str, + default="", + help="Optional comment string to include in generated HTML files", + ) + + args = parser.parse_args() + + version = args.version + if "/" in version or "\\" in version: + raise ValueError("Version string must not contain slashes.") + current_objects_path = Path(args.current_objects) + output_dir = Path(args.output_dir) + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # Read current objects JSON + with open(current_objects_path) as f: + current_objects: dict[str, list[dict[str, Any]]] = json.load(f) + + # current_objects looks like from list_objects_v2 S3 API: + """ + "Contents": [ + { + "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl", + "LastModified": "2025-11-28T14:00:32+00:00", + "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"", + "ChecksumAlgorithm": [ + "CRC64NVME" + ], + "ChecksumType": "FULL_OBJECT", + "Size": 435649349, + "StorageClass": "STANDARD" + }, + ... + ] + """ + + # Extract wheel file keys + wheel_files = [] + for item in current_objects.get("Contents", []): + key: str = item["Key"] + if key.endswith(".whl"): + wheel_files.append(key.split("/")[-1]) # only the filename is used + + print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") + + # keep only "official" files for a non-nightly version (specified by cli args) + PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") + if PY_VERSION_RE.match(version): + # upload-wheels.sh ensures no "dev" is in args.version + wheel_files = list( + filter(lambda x: version in x and "dev" not in x, wheel_files) + ) + print(f"Non-nightly version detected, wheel files used: {wheel_files}") + else: + print("Nightly version detected, keeping all wheel files.") + + # Generate index and metadata, assuming wheels and indices are stored as: + # s3://vllm-wheels/{wheel_dir}/ + # s3://vllm-wheels// + wheel_dir = args.wheel_dir or version + wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") + index_base_dir = Path(output_dir) + + generate_index_and_metadata( + whl_files=wheel_files, + wheel_base_dir=wheel_base_dir, + index_base_dir=index_base_dir, + default_variant=None, + alias_to_default=args.alias_to_default, + comment=args.comment.strip(), + ) + print(f"Successfully generated index and metadata in {output_dir}") diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 864eb470bb0a7..08da34d81d117 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -141,7 +141,6 @@ if [[ $commands == *" entrypoints/openai "* ]]; then --ignore=entrypoints/openai/test_audio.py \ --ignore=entrypoints/openai/test_shutdown.py \ --ignore=entrypoints/openai/test_completion.py \ - --ignore=entrypoints/openai/test_sleep.py \ --ignore=entrypoints/openai/test_models.py \ --ignore=entrypoints/openai/test_lora_adapters.py \ --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh new file mode 100755 index 0000000000000..b6274d698d01a --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-0-16} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} + +export CMAKE_BUILD_PARALLEL_LEVEL=16 + +# Setup cleanup +remove_docker_container() { + set -e; + docker rm -f cpu-test || true; +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu . + +# Run the image +docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test + +function cpu_tests() { + set -e + + docker exec cpu-test bash -c " + set -e + pip list" + + # offline inference + docker exec cpu-test bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run model tests + docker exec cpu-test bash -c " + set -e + pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model" + + # Run kernel tests + docker exec cpu-test bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" + + # basic online serving + docker exec cpu-test bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model Qwen/Qwen3-0.6B \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' +} + +# All of CPU tests are expected to be finished less than 40 mins. +export -f cpu_tests +timeout 2h bash -c cpu_tests diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 39ea180173081..3728f73fa2a36 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -25,20 +25,22 @@ function cpu_tests() { # offline inference podman exec -it "$container_id" bash -c " + export TORCH_COMPILE_DISABLE=1 set -xve python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " + export TORCH_COMPILE_DISABLE=1 set -evx pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib - pip install sentence-transformers datamodel_code_generator + pip install sentence-transformers datamodel_code_generator tblib # Note: disable Bart until supports V1 # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model - pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] - pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] - pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 2267718f75ca5..471c8616df85c 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -21,8 +21,8 @@ trap remove_docker_container EXIT remove_docker_container # Try building the docker image -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" @@ -50,6 +50,7 @@ function cpu_tests() { docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py" # Run basic model test diff --git a/.buildkite/scripts/hardware_ci/run-npu-test.sh b/.buildkite/scripts/hardware_ci/run-npu-test.sh index 29c8f5ed5a91a..0db1abe37ba11 100644 --- a/.buildkite/scripts/hardware_ci/run-npu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-npu-test.sh @@ -74,6 +74,7 @@ FROM ${BASE_IMAGE_NAME} # Define environments ENV DEBIAN_FRONTEND=noninteractive +ENV SOC_VERSION="ascend910b1" RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index d49f3e2f47cf1..85b554e5e8646 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -35,10 +35,11 @@ docker run \ echo $ZE_AFFINITY_MASK pip install tblib==3.1.0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp - VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/offline_inference/basic/generate.py --model Intel/Qwen2.5-0.5B-W4A16-G128-AutoRound-LLMC-TEST-ONLY --enforce-eager + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN cd tests pytest -v -s v1/core pytest -v -s v1/engine @@ -46,6 +47,6 @@ docker run \ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/test_serial_utils.py ' diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh index 5b25c358fc4aa..3fb7c82c8d333 100755 --- a/.buildkite/scripts/run-prime-rl-test.sh +++ b/.buildkite/scripts/run-prime-rl-test.sh @@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" PRIME_RL_DIR="${REPO_ROOT}/prime-rl" +if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then + echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..." + exit 0 +fi + echo "Setting up Prime-RL integration test environment..." # Clean up any existing Prime-RL directory diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 5302f524a0ae4..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -17,7 +17,17 @@ wait_for_server() { } MODEL="deepseek-ai/DeepSeek-V2-lite" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh new file mode 100644 index 0000000000000..d0921c5699d5d --- /dev/null +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE] +THRESHOLD=${1:-0.8} +NUM_Q=${2:-1319} +PORT=${3:-8020} +DATA_PARALLEL_SIZE=${4:-2} +TENSOR_PARALLEL_SIZE=${5:-2} +OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} +mkdir -p "${OUT_DIR}" + +wait_for_server() { + local port=$1 + timeout 600 bash -c ' + until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do + sleep 1 + done' +} + +MODEL="QWen/Qwen3-30B-A3B-FP8" +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi + +cleanup() { + if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then + kill "${SERVER_PID}" 2>/dev/null || true + for _ in {1..20}; do + kill -0 "${SERVER_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -9 "${SERVER_PID}" 2>/dev/null || true + fi +} +trap cleanup EXIT + +for BACK in "${BACKENDS[@]}"; do + VLLM_DEEP_GEMM_WARMUP=skip \ + vllm serve "$MODEL" \ + --enforce-eager \ + --enable-eplb \ + --all2all-backend $BACK \ + --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \ + --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --data-parallel-size ${DATA_PARALLEL_SIZE} \ + --enable-expert-parallel \ + --trust-remote-code \ + --max-model-len 2048 \ + --port $PORT & + SERVER_PID=$! + wait_for_server $PORT + + TAG=$(echo "$MODEL" | tr '/: \\n' '_____') + OUT="${OUT_DIR}/${TAG}_${BACK}.json" + python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} + python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" +PY + + cleanup + SERVER_PID= + sleep 1 + PORT=$((PORT+1)) +done diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh similarity index 64% rename from .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh rename to .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh index a5135299297e2..b3b65128e6062 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh @@ -2,9 +2,9 @@ set -euxo pipefail # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.8} +THRESHOLD=${1:-0.25} NUM_Q=${2:-1319} -PORT=${3:-8020} +PORT=${3:-8040} OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} mkdir -p "${OUT_DIR}" @@ -16,8 +16,18 @@ wait_for_server() { done' } -MODEL="QWen/Qwen3-30B-A3B-FP8" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") +MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" + +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then @@ -33,14 +43,17 @@ trap cleanup EXIT for BACK in "${BACKENDS[@]}"; do VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ vllm serve "$MODEL" \ --enforce-eager \ - --tensor-parallel-size 2 \ - --data-parallel-size 2 \ + --tensor-parallel-size 4 \ --enable-expert-parallel \ + --enable-eplb \ + --all2all-backend $BACK \ + --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ + --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ --trust-remote-code \ --max-model-len 2048 \ + --gpu-memory-utilization 0.9 \ --port $PORT & SERVER_PID=$! wait_for_server $PORT diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 945c5e48c0090..1af7f476ae74b 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -2,6 +2,28 @@ set -ex +# ======== part 0: setup ======== + +BUCKET="vllm-wheels" +INDICES_OUTPUT_DIR="indices" +DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py +PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3 +SUBPATH=$BUILDKITE_COMMIT +S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" + +# detect if python3.10+ is available +has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)") +if [[ "$has_new_python" -eq 0 ]]; then + # use new python from docker + docker pull python:3-slim + PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" +fi + +echo "Using python interpreter: $PYTHON" +echo "Python version: $($PYTHON --version)" + +# ========= part 1: collect, rename & upload the wheel ========== + # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -10,74 +32,77 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" exit 1 fi - -# Get the single wheel file wheel="${wheel_files[0]}" -# Detect architecture and rename 'linux' to appropriate manylinux version -arch=$(uname -m) -if [[ $arch == "x86_64" ]]; then - manylinux_version="manylinux1" -elif [[ $arch == "aarch64" ]]; then - manylinux_version="manylinux2014" -else - echo "Warning: Unknown architecture $arch, using manylinux1 as default" - manylinux_version="manylinux1" -fi +# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# we also accept params as manylinux tag +# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels +manylinux_version="${1:-manylinux_2_31}" # Rename 'linux' to the appropriate manylinux version in the wheel filename +if [[ "$wheel" != *"linux"* ]]; then + echo "Error: Wheel filename does not contain 'linux': $wheel" + exit 1 +fi new_wheel="${wheel/linux/$manylinux_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" +echo "Renamed wheel to: $wheel" # Extract the version from the wheel version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) -echo "Version: $version" +echo "Version in wheel: $version" +pure_version="${version%%+*}" +echo "Pure version (without variant): $pure_version" -normal_wheel="$wheel" # Save the original wheel filename +# copy wheel to its own bucket +aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" -# If the version contains "dev", rename it to v1.0.0.dev for consistency -if [[ $version == *dev* ]]; then - suffix="${version##*.}" - if [[ $suffix == cu* ]]; then - new_version="1.0.0.dev+${suffix}" - else - new_version="1.0.0.dev" - fi - new_wheel="${wheel/$version/$new_version}" - # use cp to keep both files in the artifacts directory - cp -- "$wheel" "$new_wheel" - wheel="$new_wheel" - version="$new_version" -fi +# ========= part 2: generate and upload indices ========== +# generate indices for all existing wheels in the commit directory +# this script might be run multiple times if there are multiple variants being built +# so we need to guarantee there is little chance for "TOCTOU" issues +# i.e., one process is generating indices while another is uploading a new wheel +# so we need to ensure no time-consuming operations happen below -# Upload the wheel to S3 -python3 .buildkite/generate_index.py --wheel "$normal_wheel" +# list all wheels in the commit directory +echo "Existing wheels on S3:" +aws s3 ls "$S3_COMMIT_PREFIX" +obj_json="objects.json" +aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" +mkdir -p "$INDICES_OUTPUT_DIR" -# generate index for this commit -aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" - -if [[ $normal_wheel == *"cu129"* ]]; then - # only upload index.html for cu129 wheels (default wheels) as it - # is available on both x86 and arm64 - aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" - aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" +# call script to generate indicies for all existing wheels +# this indices have relative paths that could work as long as it is next to the wheel directory in s3 +# i.e., the wheels are always in s3://vllm-wheels// +# and indices can be placed in //, or /nightly/, or // +if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then + alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" else - echo "Skipping index files for non-cu129 wheels" + alias_arg="" fi -# generate index for nightly -aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" -aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" +# HACK: we do not need regex module here, but it is required by pre-commit hook +# To avoid any external dependency, we simply replace it back to the stdlib re module +sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py +$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "commit $BUILDKITE_COMMIT" $alias_arg -if [[ $normal_wheel == *"cu129"* ]]; then - # only upload index.html for cu129 wheels (default wheels) as it - # is available on both x86 and arm64 - aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" -else - echo "Skipping index files for non-cu129 wheels" +# copy indices to // unconditionally +echo "Uploading indices to $S3_COMMIT_PREFIX" +aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" + +# copy to /nightly/ only if it is on the main branch and not a PR +if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then + echo "Uploading indices to overwrite /nightly/" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/" fi -aws s3 cp "$wheel" "s3://vllm-wheels/$version/" -aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" +# re-generate and copy to // only if it does not have "dev" in the version +if [[ "$version" != *"dev"* ]]; then + echo "Re-generating indices for /$pure_version/" + rm -rf "$INDICES_OUTPUT_DIR/*" + mkdir -p "$INDICES_OUTPUT_DIR" + # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" +fi diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37c6bd4276722..400c744852ab6 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -39,9 +39,9 @@ steps: # if this test fails, it means the nightly torch version is not compatible with some # of the dependencies. Please check the error message and add the package to whitelist # in /vllm/tools/pre_commit/generate_nightly_torch_test.py - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 - # grade: Blocking + grade: Blocking soft_fail: true source_file_dependencies: - requirements/nightly_torch_test.txt @@ -50,9 +50,9 @@ steps: - label: Async Engine, Inputs, Utils, Worker Test # 10min timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 - # grade: Blocking + grade: Blocking source_file_dependencies: - vllm/ - tests/multimodal @@ -61,25 +61,31 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins - timeout_in_minutes: 10 - mirror_hardwares: [amdexperimental, amdproduction] +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 - # grade: Blocking + grade: Blocking source_file_dependencies: - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/tool_parsers - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ + - pytest -v -s tool_parsers - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -111,9 +117,9 @@ steps: - pytest -v -s basic_correctness/test_cpu_offload.py - label: Entrypoints Unit Tests # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 - # grade: Blocking + grade: Blocking timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" fast_check: true @@ -122,7 +128,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -142,7 +148,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 @@ -156,10 +162,31 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. + # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -210,6 +237,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py @@ -248,9 +276,9 @@ steps: - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep - label: EPLB Algorithm Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 - # grade: Blocking + grade: Blocking timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" source_file_dependencies: @@ -306,28 +334,27 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 25min - timeout_in_minutes: 40 +- label: Engine Test # 9min + timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - vllm/ - tests/engine - - tests/tokenization - tests/test_sequence - tests/test_config - tests/test_logger - tests/test_vllm_port commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - # OOM in the CI unless we run this separately - - pytest -v -s tokenization -- label: V1 Test e2e + engine # 30min - timeout_in_minutes: 45 +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 + # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. + # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_8 # grade: Blocking source_file_dependencies: - vllm/ @@ -340,9 +367,9 @@ steps: - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] agent_pool: mi325_1 - # grade: Blocking + grade: Blocking source_file_dependencies: - vllm/ - tests/v1 @@ -390,10 +417,34 @@ steps: commands: - pytest -v -s v1/attention -- label: V1 Test others (CPU) # 5 mins - mirror_hardwares: [amdexperimental, amdproduction] +- label: Batch Invariance Tests (H100) # 10min + mirror_hardwares: [amdexperimental] agent_pool: mi325_1 - # grade: Blocking + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + +- label: V1 Test others (CPU) # 5 mins + mirror_hardwares: [amdexperimental, amdproduction, amdtentative] + agent_pool: mi325_1 + grade: Blocking source_file_dependencies: - vllm/ - tests/v1 @@ -409,29 +460,34 @@ steps: - label: Examples Test # 30min timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints + - vllm/multimodal - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -485,7 +541,7 @@ steps: - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -502,7 +558,7 @@ steps: - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -529,7 +585,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -558,7 +614,7 @@ steps: - label: Kernels Attention Test %N # 23min timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_8 # grade: Blocking source_file_dependencies: @@ -585,7 +641,7 @@ steps: - label: Kernels MoE Test %N # 40min timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_8 # grade: Blocking source_file_dependencies: @@ -612,6 +668,26 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) # Nvidia-centric +# Not replicating for CUTLAS & CuTe + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true @@ -669,19 +745,21 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.13.0 + - uv pip install --system torchao==0.14.1 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py -- label: LM Eval Small Models # 15min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction] +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization + autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: OpenAI API correctness # 10min timeout_in_minutes: 15 @@ -692,33 +770,10 @@ steps: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - commands: # LMEval + commands: # LMEval+Transcription WER check # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s -m 'not cpu_test' tool_use - -- label: OpenAI-Compatible Tool Use (CPU) # 5 mins - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - timeout_in_minutes: 10 - source_file_dependencies: - - vllm/ - - tests/tool_use - no_gpu: true - commands: - - pytest -v -s -m 'cpu_test' tool_use ##### models test ##### @@ -743,6 +798,7 @@ steps: torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ + - vllm/transformers_utils/ - tests/models/test_initialization.py commands: # Only when vLLM model source is modified - test initialization of a large @@ -888,6 +944,18 @@ steps: commands: - pytest -v -s models/language/pooling_mteb_test +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Processor Test # 44min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] @@ -901,7 +969,7 @@ steps: - pytest -v -s models/multimodal/processing - label: Multi-Modal Models Test (Standard) # 60min - timeout_in_minutes: 80 + timeout_in_minutes: 100 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -910,13 +978,18 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip freeze | grep -E 'torch' - - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. + # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work -- label: Multi-Modal Accuracy Eval (Small Models) # 10min - timeout_in_minutes: 70 +- label: Multi-Modal Accuracy Eval (Small Models) # 5min + timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -926,9 +999,12 @@ steps: - vllm/inputs/ - vllm/v1/core/ commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt -- label: Multi-Modal Models Test (Extended) 1 +- label: Multi-Modal Models Test (Extended) 1 # 60min + timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -937,10 +1013,13 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing -- label: Multi-Modal Models Test (Extended) 2 +- label: Multi-Modal Models Test (Extended) 2 #60min + timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -949,10 +1028,13 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' -- label: Multi-Modal Models Test (Extended) 3 +- label: Multi-Modal Models Test (Extended) 3 # 75min + timeout_in_minutes: 150 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 # grade: Blocking @@ -961,6 +1043,8 @@ steps: - vllm/ - tests/models/multimodal commands: + - export MIOPEN_DEBUG_CONV_DIRECT=0 + - export MIOPEN_DEBUG_CONV_GEMM=0 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' @@ -995,12 +1079,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -1044,8 +1128,9 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -1053,11 +1138,18 @@ steps: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ # can affect pattern matching - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py @@ -1066,7 +1158,9 @@ steps: - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1084,19 +1178,15 @@ steps: - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - tests/compile/distributed/test_fusions_e2e.py - - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile -- label: ROCm GPT-OSS Eval +- label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - agent_pool: mi325_1 - mirror_hardwares: [amdproduction] + gpu: b200 optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1105,7 +1195,7 @@ steps: - vllm/v1/attention/backends/flashinfer.py commands: - uv pip install --system 'gpt-oss[eval]==0.0.5' - - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 @@ -1132,7 +1222,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1172,13 +1262,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1205,6 +1295,7 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py @@ -1240,7 +1331,7 @@ steps: - label: Plugin Tests (2 GPUs) # 40min timeout_in_minutes: 60 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_2 # grade: Blocking working_dir: "/vllm-workspace/tests" @@ -1256,7 +1347,9 @@ steps: # end platform plugin tests # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin - pip install -e ./plugins/prithvi_io_processor_plugin - - pytest -v -s plugins_tests/test_io_processor_plugins.py + # Need tf32 to avoid conflicting precision issue with terratorch on ROCm. + # TODO: Remove after next torch update + - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y # end io_processor plugins test # begin stat_logger plugins test @@ -1308,12 +1401,15 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py + + # Disabled for now because MXFP4 backend on non-cuda platform + # doesn't support LoRA yet + #- pytest -v -s -x lora/test_gptoss_tp.py - label: Weight Loading Multiple GPU Test # 33min timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_2 # grade: Blocking working_dir: "/vllm-workspace/tests" @@ -1372,7 +1468,83 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py + - label: LM Eval Large Models # optional + gpu: a100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + + +##### H200 test ##### +- label: Distributed Tests (H200) # optional + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - pytest -v -s tests/v1/distributed/test_dbo.py + +##### B200 test ##### +- label: Distributed Tests (B200) # optional + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +##### E2E Eval Tests ##### +- label: LM Eval Small Models (1 Card) # 15min + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 # grade: Blocking @@ -1387,50 +1559,29 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -##### H100 test ##### -- label: LM Eval Large Models (H100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: h100 - optional: true - num_gpus: 4 +- label: ROCm LM Eval Large Models (8 Card) + mirror_hardwares: [amdproduction] + agent_pool: mi325_8 + num_gpus: 8 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8 + +- label: ROCm GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + agent_pool: mi325_1 + mirror_hardwares: [amdexperimental, amdproduction] + optional: true # run on nightlies source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - -##### H200 test ##### -- label: Distributed Tests (H200) # optional - mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 - # grade: Blocking - gpu: h200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - - pytest -v -s tests/v1/distributed/test_dbo.py - -##### B200 test ##### -- label: Distributed Tests (B200) # optional - gpu: b200 - optional: true - working_dir: "/vllm-workspace/" - num_gpus: 2 - commands: - - pytest -v -s tests/distributed/test_context_parallel.py - - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py - - pytest -v -s tests/v1/distributed/test_dbo.py + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 ##### RL Integration Tests ##### - label: Prime-RL Integration Test # 15min @@ -1446,9 +1597,8 @@ steps: - .buildkite/scripts/run-prime-rl-test.sh commands: - bash .buildkite/scripts/run-prime-rl-test.sh - - label: DeepSeek V2-Lite Accuracy - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 # grade: Blocking timeout_in_minutes: 60 @@ -1459,8 +1609,8 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy - mirror_hardwares: [amdexperimental] +- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 # grade: Blocking timeout_in_minutes: 60 @@ -1469,4 +1619,36 @@ steps: num_gpus: 4 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) + timeout_in_minutes: 60 + gpu: b200 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5309581d8e81f..7b664c4fa15fe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -57,14 +57,16 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins - timeout_in_minutes: 10 +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min + timeout_in_minutes: 30 source_file_dependencies: - vllm/ - tests/test_inputs.py - tests/test_outputs.py - tests/multimodal - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/tool_parsers - tests/transformers_utils - tests/config no_gpu: true @@ -73,6 +75,8 @@ steps: - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ + - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config @@ -110,7 +114,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -128,7 +132,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests -- label: Entrypoints Integration Test (API Server) # 100min +- label: Entrypoints Integration Test (API Server 1) # 100min timeout_in_minutes: 130 mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -140,10 +144,26 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ - pytest -v -s entrypoints/test_chat_utils.py +- label: Entrypoints Integration Test (API Server 2) + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + working_dir: "/vllm-workspace/tests" + fast_check: true + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/entrypoints/sleep + - tests/entrypoints/rpc + - tests/tool_use + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/sleep + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s tool_use + - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] @@ -192,6 +212,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py @@ -275,21 +296,18 @@ steps: - pytest -v -s test_regression.py working_dir: "/vllm-workspace/tests" # optional -- label: Engine Test # 25min - timeout_in_minutes: 40 +- label: Engine Test # 9min + timeout_in_minutes: 15 mirror_hardwares: [amdexperimental] source_file_dependencies: - vllm/ - tests/engine - - tests/tokenization - tests/test_sequence - tests/test_config - tests/test_logger - tests/test_vllm_port commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py - # OOM in the CI unless we run this separately - - pytest -v -s tokenization - label: V1 Test e2e + engine # 30min timeout_in_minutes: 45 @@ -301,7 +319,10 @@ steps: # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - pytest -v -s v1/e2e - - pytest -v -s v1/engine + # Run this test standalone for now; + # need to untangle use (implicit) use of spawn/fork across the tests. + - pytest -v -s v1/engine/test_preprocess_error_handling.py + - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 @@ -346,6 +367,19 @@ steps: commands: - pytest -v -s v1/attention +- label: Batch Invariance Tests (H100) # 10min + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - label: V1 Test attention (B200) # 10min timeout_in_minutes: 30 gpu: b200 @@ -375,23 +409,28 @@ steps: working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints + - vllm/multimodal - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -450,7 +489,9 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -464,7 +505,9 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -550,6 +593,25 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true @@ -599,7 +661,8 @@ steps: # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system torchao==0.14.1 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: LM Eval Small Models # 53min @@ -610,7 +673,7 @@ steps: - vllm/model_executor/layers/quantization autorun_on_main: true commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt - label: OpenAI API correctness # 22min timeout_in_minutes: 30 @@ -622,25 +685,6 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: OpenAI-Compatible Tool Use # 23 min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental] - fast_check: false - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s -m 'not cpu_test' tool_use - -- label: OpenAI-Compatible Tool Use (CPU) # 5 mins - timeout_in_minutes: 10 - source_file_dependencies: - - vllm/ - - tests/tool_use - no_gpu: true - commands: - - pytest -v -s -m 'cpu_test' tool_use - ##### models test ##### - label: Basic Models Tests (Initialization) @@ -650,6 +694,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Run a subset of model initialization tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset @@ -660,7 +705,9 @@ steps: torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ + - vllm/transformers_utils/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Only when vLLM model source is modified - test initialization of a large # subset of supported models (the complement of the small subset in the above @@ -786,14 +833,24 @@ steps: commands: - pytest -v -s models/language/pooling_mteb_test -- label: Multi-Modal Processor Test # 44min +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'" + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor Test timeout_in_minutes: 60 source_file_dependencies: - vllm/ - tests/models/multimodal commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal/processing + - pytest -v -s models/multimodal/processing/test_tensor_schema.py - label: Multi-Modal Models Test (Standard) # 60min timeout_in_minutes: 80 @@ -870,11 +927,12 @@ steps: - label: Transformers Nightly Models Test working_dir: "/vllm-workspace/" optional: true + soft_fail: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py - pytest -v -s tests/models/test_transformers.py - # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/processing/ - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl @@ -1015,7 +1073,7 @@ steps: - csrc/ - vllm/model_executor/layers/quantization commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1051,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 @@ -1084,6 +1142,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py @@ -1173,6 +1232,8 @@ steps: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # Alot of these tests are on the edge of OOMing + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # There is some Tensor Parallelism related processing logic in LoRA that # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py @@ -1267,13 +1328,13 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - - pytest -v -s tests/distributed/test_sequence_parallel.py + - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### @@ -1291,12 +1352,14 @@ steps: - label: Prime-RL Integration Test # 15min timeout_in_minutes: 30 optional: true + soft_fail: true num_gpus: 2 working_dir: "/vllm-workspace" source_file_dependencies: - vllm/ - .buildkite/scripts/run-prime-rl-test.sh commands: + - nvidia-smi - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy @@ -1308,11 +1371,20 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 -- label: Qwen3-30B-A3B-FP8-block Accuracy +- label: Qwen3-30B-A3B-FP8-block Accuracy (H100) timeout_in_minutes: 60 gpu: h100 optional: true num_gpus: 4 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) + timeout_in_minutes: 60 + gpu: b200 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml new file mode 100644 index 0000000000000..6e444eae14c74 --- /dev/null +++ b/.buildkite/test_areas/attention.yaml @@ -0,0 +1,21 @@ +group: Attention +depends_on: + - image-build +steps: +- label: V1 attention (H100) + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 attention (B200) + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml new file mode 100644 index 0000000000000..759d2b5358714 --- /dev/null +++ b/.buildkite/test_areas/basic_correctness.yaml @@ -0,0 +1,16 @@ +group: Basic Correctness +depends_on: + - image-build +steps: +- label: Basic Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml new file mode 100644 index 0000000000000..574b642d407b0 --- /dev/null +++ b/.buildkite/test_areas/benchmarks.yaml @@ -0,0 +1,19 @@ +group: Benchmarks +depends_on: + - image-build +steps: +- label: Benchmarks + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml new file mode 100644 index 0000000000000..0ba00925a4838 --- /dev/null +++ b/.buildkite/test_areas/compile.yaml @@ -0,0 +1,57 @@ +group: Compile +depends_on: + - image-build +steps: +- label: Fusion and Compile Tests (B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +- label: Fusion E2E (2 GPUs)(B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml new file mode 100644 index 0000000000000..50c0c338c2434 --- /dev/null +++ b/.buildkite/test_areas/cuda.yaml @@ -0,0 +1,22 @@ +group: CUDA +depends_on: + - image-build +steps: +- label: Platform Tests (CUDA) + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Cudagraph + timeout_in_minutes: 20 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py \ No newline at end of file diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml new file mode 100644 index 0000000000000..65a981a9d6d00 --- /dev/null +++ b/.buildkite/test_areas/distributed.yaml @@ -0,0 +1,199 @@ +group: Distributed +depends_on: + - image-build +steps: +- label: Distributed Comm Ops + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: Distributed (2 GPUs) + timeout_in_minutes: 90 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Tests (4 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - cd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + +- label: Distributed Tests (8 GPUs)(H100) + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: Distributed Tests (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: Distributed Tests (2 GPUs)(H200) + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: Distributed Tests (2 GPUs)(B200) + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: 2 Node Test (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" + +- label: Distributed NixlConnector PD accuracy (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + +- label: Pipeline + Context Parallelism (4 GPUs)) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py \ No newline at end of file diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml new file mode 100644 index 0000000000000..2e0857986c3fa --- /dev/null +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -0,0 +1,42 @@ +group: E2E Integration +depends_on: + - image-build +steps: +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) + timeout_in_minutes: 60 + gpu: b200 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + soft_fail: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml new file mode 100644 index 0000000000000..a028e0e4af4c1 --- /dev/null +++ b/.buildkite/test_areas/engine.yaml @@ -0,0 +1,26 @@ +group: Engine +depends_on: + - image-build +steps: +- label: Engine + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +- label: V1 e2e + engine + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + - pytest -v -s v1/engine diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml new file mode 100644 index 0000000000000..5b16ea9c1ad07 --- /dev/null +++ b/.buildkite/test_areas/entrypoints.yaml @@ -0,0 +1,83 @@ +group: Entrypoints +depends_on: + - image-build +steps: +- label: Entrypoints Unit Tests + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration (LLM) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration (API Server 1) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + + +- label: Entrypoints Integration (API Server 2) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/tool_use + - tests/entrypoints/sleep + - tests/entrypoints/instrumentator + - tests/entrypoints/rpc + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/sleep + - pytest -v -s tool_use + +- label: Entrypoints Integration (Pooling) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + + +- label: Entrypoints V1 + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: OpenAI API Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml new file mode 100644 index 0000000000000..feb8252148c7f --- /dev/null +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -0,0 +1,23 @@ +group: Expert Parallelism +depends_on: + - image-build +steps: +- label: EPLB Algorithm + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py \ No newline at end of file diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml new file mode 100644 index 0000000000000..7ca099516d641 --- /dev/null +++ b/.buildkite/test_areas/kernels.yaml @@ -0,0 +1,117 @@ +group: Kernels +depends_on: + - image-build +steps: +- label: Kernels Core Operation Test + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N + timeout_in_minutes: 35 + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N + timeout_in_minutes: 60 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test + timeout_in_minutes: 45 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + +- label: Kernels (B200) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py \ No newline at end of file diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml new file mode 100644 index 0000000000000..e2498512bdef7 --- /dev/null +++ b/.buildkite/test_areas/lm_eval.yaml @@ -0,0 +1,46 @@ +group: LM Eval +depends_on: + - image-build +steps: +- label: LM Eval Small Models + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt + +- label: LM Eval Large Models (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (4 GPUs)(H100) + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: LM Eval Small Models (B200) + timeout_in_minutes: 120 + gpu: b200 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml new file mode 100644 index 0000000000000..59ade40cc8f52 --- /dev/null +++ b/.buildkite/test_areas/lora.yaml @@ -0,0 +1,33 @@ +group: LoRA +depends_on: + - image-build +steps: +- label: LoRA %N + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + + +- label: LoRA TP (Distributed) + timeout_in_minutes: 30 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # Alot of these tests are on the edge of OOMing + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py \ No newline at end of file diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml new file mode 100644 index 0000000000000..252af1e56a105 --- /dev/null +++ b/.buildkite/test_areas/misc.yaml @@ -0,0 +1,165 @@ +group: Miscellaneous +depends_on: + - image-build +steps: +- label: V1 Others + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Others (CPU) + depends_on: ~ + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + +- label: Regression + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Examples + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/chat.py # for basic + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Metrics, Tracing (2 GPUs) + timeout_in_minutes: 20 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Python-only Installation + depends_on: ~ + timeout_in_minutes: 20 + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Async Engine, Inputs, Utils, Worker + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: ~ + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/tool_parsers + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ + - pytest -v -s tool_parsers + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: GPT-OSS Eval (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Batch Invariance (H100) + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py \ No newline at end of file diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml new file mode 100644 index 0000000000000..996c8bb8b780a --- /dev/null +++ b/.buildkite/test_areas/model_executor.yaml @@ -0,0 +1,17 @@ +group: Model Executor +depends_on: + - image-build +steps: +- label: Model Executor + timeout_in_minutes: 35 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml new file mode 100644 index 0000000000000..2a86596a6d603 --- /dev/null +++ b/.buildkite/test_areas/models_basic.yaml @@ -0,0 +1,64 @@ +group: Models - Basic +depends_on: + - image-build +steps: +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + - tests/models/registry.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + - tests/models/registry.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Transformers Nightly Models + working_dir: "/vllm-workspace/" + optional: true + soft_fail: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml new file mode 100644 index 0000000000000..b6bfbf2ddab47 --- /dev/null +++ b/.buildkite/test_areas/models_distributed.yaml @@ -0,0 +1,22 @@ +group: Models - Distributed +depends_on: + - image-build +steps: +- label: Distributed Model Tests (2 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml new file mode 100644 index 0000000000000..f70192c4ebc0a --- /dev/null +++ b/.buildkite/test_areas/models_language.yaml @@ -0,0 +1,91 @@ +group: Models - Language +depends_on: + - image-build +steps: +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml new file mode 100644 index 0000000000000..fc24068c20a46 --- /dev/null +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -0,0 +1,79 @@ +group: Models - Multimodal +depends_on: + - image-build +steps: +- label: Multi-Modal Models (Standard) # 60min + timeout_in_minutes: 80 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor # 44min + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: Multi-Modal Models (Extended) 1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models (Extended) 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models (Extended) 3 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml new file mode 100644 index 0000000000000..60c179aa098e1 --- /dev/null +++ b/.buildkite/test_areas/plugins.yaml @@ -0,0 +1,34 @@ +group: Plugins +depends_on: + - image-build +steps: +- label: Plugin Tests (2 GPUs) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml new file mode 100644 index 0000000000000..332d5202d8338 --- /dev/null +++ b/.buildkite/test_areas/pytorch.yaml @@ -0,0 +1,52 @@ +group: PyTorch +depends_on: + - image-build +steps: +- label: PyTorch Compilation Unit Tests + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" + +- label: PyTorch Fullgraph Smoke Test + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" + +- label: PyTorch Fullgraph + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh \ No newline at end of file diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml new file mode 100644 index 0000000000000..6e89d6af3b8d1 --- /dev/null +++ b/.buildkite/test_areas/quantization.yaml @@ -0,0 +1,46 @@ +group: Quantization +depends_on: + - image-build +steps: +- label: Quantization + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: Quantized MoE Test (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Quantized Models Test + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml new file mode 100644 index 0000000000000..ad377148fd073 --- /dev/null +++ b/.buildkite/test_areas/samplers.yaml @@ -0,0 +1,14 @@ +group: Samplers +depends_on: + - image-build +steps: +- label: Samplers Test + timeout_in_minutes: 75 + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml new file mode 100644 index 0000000000000..cfc5bb20fe7ad --- /dev/null +++ b/.buildkite/test_areas/weight_loading.yaml @@ -0,0 +1,25 @@ +group: Weight Loading +depends_on: + - image-build +steps: +- label: Weight Loading Multiple GPU # 33min + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6e178bb690c56..4d7a366f05e37 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,11 +9,13 @@ /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep /vllm/model_executor/model_loader @22quinn +/vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/vllm_flash_attn @LucasWilkinson /vllm/lora @jeejeelee /vllm/reasoning @aarnphm @chaunceyjiang /vllm/entrypoints @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang /vllm/compilation @zou3519 @youkaichao @ProExpertProg /vllm/distributed/kv_transfer @NickLucche @ApostaC CMakeLists.txt @tlrmchlsmth @LucasWilkinson @@ -35,6 +37,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/offloading @ApostaC +# Model runner V2 +/vllm/v1/worker/gpu @WoosukKwon + # Test ownership /.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao @@ -56,6 +61,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/v1/kv_connector/nixl_integration @NickLucche /tests/v1/kv_connector @ApostaC /tests/v1/offloading @ApostaC +/tests/v1/determinism @yewentao256 # Transformers modeling backend /vllm/model_executor/models/transformers @hmellor @@ -141,9 +147,10 @@ mkdocs.yaml @hmellor /requirements/kv_connectors.txt @NickLucche # Pooling models -/examples/*/pooling/ @noooop +/examples/pooling @noooop /tests/models/*/pooling* @noooop /tests/entrypoints/pooling @noooop +/vllm/entrypoints/pooling @noooop /vllm/config/pooler.py @noooop /vllm/pooling_params.py @noooop /vllm/model_executor/layers/pooler.py @noooop diff --git a/.github/mergify.yml b/.github/mergify.yml index 997a40e18e588..61a03135be395 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -14,6 +14,52 @@ pull_request_rules: comment: message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/" +- name: comment-pre-commit-failure + description: Comment on PR when pre-commit check fails + conditions: + - status-failure=pre-commit + - -closed + - -draft + actions: + comment: + message: | + Hi @{{author}}, the pre-commit checks have failed. Please run: + + ```bash + uv pip install pre-commit + pre-commit install + pre-commit run --all-files + ``` + + Then, commit the changes and push to your branch. + + For future commits, `pre-commit` will run automatically on changed files before each commit. + + > [!TIP] + >
+ > Is mypy or markdownlint failing? + >
+ > mypy and markdownlint are run differently in CI. If the failure is related to either of these checks, please use the following commands to run them locally: + > + > ```bash + > # For mypy (substitute "3.10" with the failing version if needed) + > pre-commit run --hook-stage manual mypy-3.10 + > # For markdownlint + > pre-commit run --hook-stage manual markdownlint + > ``` + >
+ +- name: comment-dco-failure + description: Comment on PR when DCO check fails + conditions: + - status-failure=dco + - -closed + - -draft + actions: + comment: + message: | + Hi @{{author}}, the DCO check has failed. Please click on DCO in the Checks section for instructions on how to resolve this. + - name: label-ci-build description: Automatically apply ci/build label conditions: @@ -140,7 +186,7 @@ pull_request_rules: - files~=^tests/entrypoints/test_context.py - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py - - files~=^vllm/entrypoints/harmony_utils.py + - files~=^vllm/entrypoints/openai/parser/harmony_utils.py - files~=^vllm/entrypoints/tool_server.py - files~=^vllm/entrypoints/tool.py - files~=^vllm/entrypoints/context.py @@ -189,6 +235,20 @@ pull_request_rules: add: - rocm +- name: label-cpu + description: Automatically apply cpu label + conditions: + - label != stale + - files~=^(?!.*kv_offload)(?!.*cpu_offload).*\bcpu.* + actions: + label: + add: + - cpu + assign: + users: + - "fadara01" + - "aditew01" + - name: label-structured-output description: Automatically apply structured-output label conditions: @@ -289,6 +349,18 @@ pull_request_rules: add: - tool-calling +- name: auto-rebase if approved, ready, and 40 commits behind main + conditions: + - base = main + - label=ready + - "#approved-reviews-by >= 1" + - "#commits-behind >= 40" + - -closed + - -draft + - -conflict + actions: + rebase: {} + - name: ping author on conflicts and add 'needs-rebase' label conditions: - label != stale @@ -358,4 +430,4 @@ pull_request_rules: actions: label: add: - - kv-connector \ No newline at end of file + - kv-connector diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index c3e132a536a42..df8910837715d 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -13,10 +13,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Set up Python - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: '3.12' diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml index 7d565ef9f2e45..629966b959330 100644 --- a/.github/workflows/issue_autolabel.yml +++ b/.github/workflows/issue_autolabel.yml @@ -105,6 +105,31 @@ jobs: } ], }, + cpu: { + // Keyword search - matches whole words only (with word boundaries) + keywords: [ + { + term: "CPU Backend", + searchIn: "title" + }, + { + term: "x86", + searchIn: "title" + }, + { + term: "ARM", + searchIn: "title" + }, + { + term: "Apple Silicon", + searchIn: "title" + }, + { + term: "IBM Z", + searchIn: "title" + }, + ], + }, // Add more label configurations here as needed // example: { // keywords: [...], diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index a183033c9adde..e80a5c0cc80f9 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -12,7 +12,7 @@ jobs: timeout-minutes: 30 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6.0.1 - uses: astral-sh/setup-uv@v7 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index e21d13b8161f3..1041653c2f57e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,8 +16,8 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index dca3089f496c9..44bf71db5e9de 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -7,13 +7,15 @@ on: jobs: close-issues-and-pull-requests: + # Prevents triggering on forks or other repos + if: github.repository == 'vllm-project/vllm' permissions: issues: write pull-requests: write actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 + - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/CMakeLists.txt b/CMakeLists.txt index ae8e6175443f3..c46fb18d7bfef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,8 +56,8 @@ endif() # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1") # # Try to find python package with an executable that exactly matches @@ -136,7 +136,7 @@ elseif(HIP_FOUND) # ROCm 5.X and 6.X if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM}) message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() @@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -354,9 +354,22 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") - if (MARLIN_ARCHS) + + # marlin arches for fp16 output + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}") + # marlin has limited support for turing + cuda_archs_loose_intersection(MARLIN_SM75_ARCHS "7.5" "${CUDA_ARCHS}") + # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX) + cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") + # marlin arches for fp8 input + # - sm80 doesn't support fp8 computation + # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction + # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) + cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}") + # marlin arches for other files + cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}") + + if (MARLIN_OTHER_ARCHS) # # For the Marlin kernels we automatically generate sources for various @@ -365,16 +378,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(MARLIN_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py) file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH) + list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR) + set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})") - message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}") - message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}") + message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") + message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") - if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH} - OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH}) + if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} + OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}) execute_process( COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=$PYTHONPATH - ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} + PYTHONPATH=$ENV{PYTHONPATH} + ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR} RESULT_VARIABLE marlin_generation_result OUTPUT_VARIABLE marlin_generation_result OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log @@ -387,40 +402,76 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log") else() - set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH} - CACHE STRING "Last run Marlin generate script hash" FORCE) + set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH} + CACHE STRING "Last run Marlin generate script hash and arch" FORCE) message(STATUS "Marlin generation completed successfully.") endif() else() message(STATUS "Marlin generation script has not changed, skipping generation.") endif() - file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu") - set_gencode_flags_for_srcs( - SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" - CUDA_ARCHS "${MARLIN_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + if (MARLIN_ARCHS) + file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) + + file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_BF16_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC}) endif() - list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC}) + if (MARLIN_SM75_ARCHS) + file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_SM75_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_SM75_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC}) + endif() + + if (MARLIN_FP8_ARCHS) + file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}" + CUDA_ARCHS "${MARLIN_FP8_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC}) + endif() set(MARLIN_SRCS "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" "csrc/quantization/gptq_marlin/gptq_marlin.cu" + "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/quantization/gptq_marlin/awq_marlin_repack.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_SRCS}" - CUDA_ARCHS "${MARLIN_ARCHS}") + CUDA_ARCHS "${MARLIN_OTHER_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu" + set_source_files_properties(${MARLIN_SRCS} PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") endif() list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}") - message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") + message(STATUS "Building Marlin kernels for archs: ${MARLIN_OTHER_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" " in CUDA target architectures") @@ -604,12 +655,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(SRCS "csrc/quantization/fp4/nvfp4_quant_kernels.cu" "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu" - "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu") + "csrc/quantization/fp4/nvfp4_experts_quant.cu" + "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu" + "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${FP4_ARCHS}") list(APPEND VLLM_EXT_SRC "${SRCS}") list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1") message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") else() message(STATUS "Not building NVFP4 as no compatible archs were found.") @@ -745,24 +799,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") else() cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}") endif() - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") - set_gencode_flags_for_srcs( - SRCS "${SRCS}" - CUDA_ARCHS "${SCALED_MM_ARCHS}") - list(APPEND VLLM_EXT_SRC "${SRCS}") - list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1") - message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}") - else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) - message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is " - "not >= 12.8, we recommend upgrading to CUDA 12.8 or later " - "if you intend on running FP8 quantized MoE models on Blackwell.") - else() - message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found " - "in CUDA target architectures") - endif() - endif() # # Machete kernels @@ -786,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH} ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -838,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS) set(SRCS - "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu") + "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu" + "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu" + "csrc/quantization/cutlass_w4a8/w4a8_utils.cu" + ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -908,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" - "csrc/moe/moe_lora_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") @@ -938,9 +976,20 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") CUDA_ARCHS "${CUDA_ARCHS}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") - # 9.0 for latest bf16 atomicAdd PTX - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}") - if (MARLIN_MOE_ARCHS) + # moe marlin arches + # note that we always set `use_atomic_add=False` for moe marlin now, + # so we don't need 9.0 for bf16 atomicAdd PTX + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}") + # moe marlin has limited support for turing + cuda_archs_loose_intersection(MARLIN_MOE_SM75_ARCHS "7.5" "${CUDA_ARCHS}") + # moe marlin arches for fp8 input + # - sm80 doesn't support fp8 computation + # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction + # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0) + cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}") + # moe marlin arches for other files + cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}") + if (MARLIN_MOE_OTHER_ARCHS) # # For the Marlin MOE kernels we automatically generate sources for various @@ -949,16 +998,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") set(MOE_MARLIN_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py) file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH) + list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR) + set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})") - message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}") - message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}") + message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") + message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}") - if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} - OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH}) + if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} + OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}) execute_process( COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=$PYTHONPATH - ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} + PYTHONPATH=$ENV{PYTHONPATH} + ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR} RESULT_VARIABLE moe_marlin_generation_result OUTPUT_VARIABLE moe_marlin_generation_output OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log @@ -971,7 +1022,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log") else() - set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH} + set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} CACHE STRING "Last run Marlin MOE generate script hash" FORCE) message(STATUS "Marlin MOE generation completed successfully.") endif() @@ -979,18 +1030,53 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Marlin MOE generation script has not changed, skipping generation.") endif() - file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu") - set_gencode_flags_for_srcs( - SRCS "${MOE_WNAA16_MARLIN_SRC}" - CUDA_ARCHS "${MARLIN_MOE_ARCHS}") - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) - set_source_files_properties(${MOE_WNAA16_MARLIN_SRC} - PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + if (MARLIN_MOE_ARCHS) + file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_SRC}" + CUDA_ARCHS "${MARLIN_MOE_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC}) endif() - list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC}) + if (MARLIN_MOE_SM75_ARCHS) + file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_SM75_SRC}" + CUDA_ARCHS "${MARLIN_MOE_SM75_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_SM75_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SM75_SRC}) + endif() - message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") + if (MARLIN_MOE_FP8_ARCHS) + file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_FP8_SRC}" + CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_FP8_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC}) + endif() + + set(MARLIN_MOE_OTHER_SRC "csrc/moe/marlin_moe_wna16/ops.cu") + set_gencode_flags_for_srcs( + SRCS "${MARLIN_MOE_OTHER_SRC}" + CUDA_ARCHS "${MARLIN_MOE_OTHER_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + set_source_files_properties(${MARLIN_MOE_OTHER_SRC} + PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false") + endif() + list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_OTHER_SRC}") + + message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_OTHER_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" " in CUDA target architectures") diff --git a/README.md b/README.md index 033e1035d8916..26222b815370d 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio *Latest News* 🔥 +- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing). - [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI) - [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link). - [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6). @@ -136,16 +137,19 @@ Compute Resources: - Alibaba Cloud - AMD - Anyscale +- Arm - AWS - Crusoe Cloud - Databricks - DeepInfra - Google Cloud +- IBM - Intel - Lambda Lab - Nebius - Novita AI - NVIDIA +- Red Hat - Replicate - Roblox - RunPod diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index d1bdb4c43f10b..9a9600e08dafe 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -83,7 +83,7 @@ MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number ``` -#### 2. Maximize Throughput with a Latency Requirement +### 2. Maximize Throughput with a Latency Requirement - **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. - **Configuration**: @@ -96,7 +96,7 @@ MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=500 ``` -#### 3. Maximize Throughput with Prefix Caching and Latency Requirements +### 3. Maximize Throughput with Prefix Caching and Latency Requirements - **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. - **Configuration**: diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 56b721cbb4021..a245e2022e605 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0} MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} +HOSTNAME=$(hostname) +if [[ -z "$HOSTNAME" ]]; then + echo "Error: Failed to determine hostname." >&2 + exit 1 +fi LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" @@ -82,6 +87,7 @@ start_server() { "$MODEL" "--disable-log-requests" "--port" "8004" + "--host" "$HOSTNAME" "--gpu-memory-utilization" "$gpu_memory_utilization" "--max-num-seqs" "$max_num_seqs" "--max-num-batched-tokens" "$max_num_batched_tokens" @@ -96,8 +102,9 @@ start_server() { # This correctly passes each element as a separate argument. if [[ -n "$profile_dir" ]]; then # Start server with profiling enabled - VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ - vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}" + VLLM_SERVER_DEV_MODE=1 \ + vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 & else # Start server without profiling VLLM_SERVER_DEV_MODE=1 \ @@ -112,7 +119,7 @@ start_server() { # since that we should always have permission to send signal to the server process. kill -0 $server_pid 2> /dev/null || break - RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout) + RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout) STATUS_CODE=$(echo "$RESPONSE" | tail -n 1) if [[ "$STATUS_CODE" -eq 200 ]]; then server_started=1 @@ -172,6 +179,7 @@ run_benchmark() { --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 1000 \ --random-prefix-len $prefix_len \ + --host "$HOSTNAME" \ --port 8004 &> "$bm_log" throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') @@ -187,7 +195,7 @@ run_benchmark() { request_rate=$((${throughput%.*} + 1)) while ((request_rate > 0)); do # clear prefix cache - curl -X POST http://0.0.0.0:8004/reset_prefix_cache + curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache sleep 5 bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt" vllm bench serve \ @@ -203,6 +211,7 @@ run_benchmark() { --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 100 \ --random-prefix-len $prefix_len \ + --host "$HOSTNAME" \ --port 8004 &> "$bm_log" throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') @@ -303,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then --goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --num-prompts 100 \ --random-prefix-len $prefix_len \ + --host "$HOSTNAME" \ --port 8004 \ --profile &> "$bm_log" else diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4021fede72153..831b76b66e096 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -620,7 +620,7 @@ def get_tokenizer( kwargs["use_fast"] = False if tokenizer_mode == "mistral": try: - from vllm.transformers_utils.tokenizer import MistralTokenizer + from vllm.tokenizers.mistral import MistralTokenizer except ImportError as e: raise ImportError( "MistralTokenizer requires vllm package.\n" diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py index b5c16c42de467..7473a41e51406 100755 --- a/benchmarks/benchmark_batch_invariance.py +++ b/benchmarks/benchmark_batch_invariance.py @@ -104,7 +104,6 @@ def run_benchmark_with_batch_invariant( random.seed(seed) # Set environment variables - os.environ["VLLM_ATTENTION_BACKEND"] = backend if batch_invariant: os.environ["VLLM_BATCH_INVARIANT"] = "1" else: @@ -140,6 +139,7 @@ def run_benchmark_with_batch_invariant( max_model_len=max_model_len, dtype="bfloat16", tensor_parallel_size=tp_size, + attention_config={"backend": backend}, enable_prefix_caching=False, ) init_time = time.perf_counter() - start_init diff --git a/benchmarks/benchmark_hash.py b/benchmarks/benchmark_hash.py new file mode 100644 index 0000000000000..08cdc012d6527 --- /dev/null +++ b/benchmarks/benchmark_hash.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Micro benchmark comparing built-in hash(), SHA-256, and xxHash. + +This focuses on a single test payload shaped like the prefix-cache hash input: + (32-byte bytes object, 32-int tuple) + +Usage: + python benchmarks/hash_micro_benchmark.py --iterations 20000 +""" + +from __future__ import annotations + +import argparse +import random +import statistics +import time +from collections.abc import Callable, Iterable + +from vllm.utils.hashing import sha256, xxhash + + +def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]: + """Generate a deterministic test payload.""" + random.seed(seed) + bytes_data = bytes(random.getrandbits(8) for _ in range(32)) + int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32)) + return (bytes_data, int_tuple) + + +def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int): + """Return (avg_seconds, std_seconds) for hashing `data` `iterations` times.""" + times: list[float] = [] + + # Warm-up to avoid first-run noise. + for _ in range(200): + func(data) + + for _ in range(iterations): + start = time.perf_counter() + func(data) + end = time.perf_counter() + times.append(end - start) + + avg = statistics.mean(times) + std = statistics.stdev(times) if len(times) > 1 else 0.0 + return avg, std + + +def _run_benchmarks( + benchmarks: Iterable[tuple[str, Callable[[tuple], object]]], + data: tuple, + iterations: int, +): + """Yield (name, avg, std) for each benchmark, skipping unavailable ones.""" + for name, func in benchmarks: + try: + avg, std = _benchmark_func(func, data, iterations) + except ModuleNotFoundError as exc: + print(f"Skipping {name}: {exc}") + continue + yield name, avg, std + + +def builtin_hash(data: tuple) -> int: + """Wrapper for Python's built-in hash().""" + return hash(data) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--iterations", + type=int, + default=10_000, + help="Number of measured iterations per hash function.", + ) + parser.add_argument( + "--seed", type=int, default=42, help="Random seed for test payload." + ) + args = parser.parse_args() + + data = _generate_test_data(args.seed) + benchmarks = ( + ("SHA256 (pickle)", sha256), + ("xxHash (pickle)", xxhash), + ("built-in hash()", builtin_hash), + ) + + print("=" * 60) + print("HASH FUNCTION MICRO BENCHMARK") + print("=" * 60) + print("Test data: (32-byte bytes object, 32-int tuple)") + print(f"Iterations: {args.iterations:,}") + print("=" * 60) + + results = list(_run_benchmarks(benchmarks, data, args.iterations)) + builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None) + + print("\nResults:") + for name, avg, std in results: + print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs") + + if builtin_entry: + _, builtin_avg, _ = builtin_entry + print("\n" + "=" * 60) + print("SUMMARY (relative to built-in hash())") + print("=" * 60) + for name, avg, _ in results: + if name == "built-in hash()": + continue + speed_ratio = avg / builtin_avg + print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()") + else: + print("\nBuilt-in hash() result missing; cannot compute speed ratios.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index dedb564fffac8..b5373d383b548 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -32,12 +32,11 @@ def benchmark_propose(args): model_config = ModelConfig( model="facebook/opt-125m", - task="generate", max_model_len=args.num_token + args.num_spec_token, tokenizer="facebook/opt-125m", tokenizer_mode="auto", dtype="auto", - seed=None, + seed=0, trust_remote_code=False, ) proposer = NgramProposer( @@ -108,7 +107,10 @@ def benchmark_batched_propose(args): device_config=DeviceConfig(device=current_platform.device_type), parallel_config=ParallelConfig(), load_config=LoadConfig(), - scheduler_config=SchedulerConfig(), + scheduler_config=SchedulerConfig( + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, + ), ) # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group diff --git a/benchmarks/benchmark_prefix_block_hash.py b/benchmarks/benchmark_prefix_block_hash.py new file mode 100644 index 0000000000000..8bcd8af0d3102 --- /dev/null +++ b/benchmarks/benchmark_prefix_block_hash.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Simple benchmark to compare prefix-cache block hashing algorithms. + +Example: + python benchmark_prefix_block_hash.py --num-blocks 20000 --block-size 32 +""" + +from __future__ import annotations + +import argparse +import random +import statistics +import sys +import time +from collections.abc import Callable, Iterable, Sequence + +from vllm.utils.hashing import get_hash_fn_by_name +from vllm.v1.core.kv_cache_utils import BlockHash, hash_block_tokens, init_none_hash + +SUPPORTED_ALGOS = ("sha256", "sha256_cbor", "xxhash", "xxhash_cbor") + + +def _generate_blocks( + num_blocks: int, block_size: int, vocab_size: int, seed: int +) -> list[list[int]]: + rng = random.Random(seed) + return [ + [rng.randrange(vocab_size) for _ in range(block_size)] + for _ in range(num_blocks) + ] + + +def _hash_all_blocks( + hash_fn: Callable[[object], bytes], + blocks: Iterable[Sequence[int]], +) -> float: + parent_hash: BlockHash | None = None + start = time.perf_counter() + for block in blocks: + parent_hash = hash_block_tokens(hash_fn, parent_hash, block, extra_keys=None) + end = time.perf_counter() + return end - start + + +def _benchmark( + hash_algo: str, + blocks: list[list[int]], + trials: int, +) -> tuple[float, float, float] | None: + try: + hash_fn = get_hash_fn_by_name(hash_algo) + init_none_hash(hash_fn) + timings = [_hash_all_blocks(hash_fn, blocks) for _ in range(trials)] + except ModuleNotFoundError as exc: + print(f"Skipping {hash_algo}: {exc}", file=sys.stderr) + return None + + avg = statistics.mean(timings) + best = min(timings) + # throughput: tokens / second + tokens_hashed = len(blocks) * len(blocks[0]) + throughput = tokens_hashed / best + return avg, best, throughput + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--num-blocks", type=int, default=10000, help="Block count.") + parser.add_argument("--block-size", type=int, default=32, help="Tokens per block.") + parser.add_argument( + "--vocab-size", type=int, default=32000, help="Token id range [0, vocab_size)." + ) + parser.add_argument("--seed", type=int, default=0, help="Random seed.") + parser.add_argument( + "--trials", type=int, default=5, help="Number of timed trials per algorithm." + ) + parser.add_argument( + "--algorithms", + nargs="+", + default=SUPPORTED_ALGOS, + choices=SUPPORTED_ALGOS, + help="Hash algorithms to benchmark.", + ) + args = parser.parse_args() + + blocks = _generate_blocks( + args.num_blocks, args.block_size, args.vocab_size, args.seed + ) + print( + f"Benchmarking {len(args.algorithms)} algorithms on " + f"{args.num_blocks} blocks (block size={args.block_size})." + ) + + for algo in args.algorithms: + result = _benchmark(algo, blocks, args.trials) + if result is None: + continue + + avg, best, throughput = result + print( + f"{algo:14s} avg: {avg:.6f}s best: {best:.6f}s " + f"throughput: {throughput / 1e6:.2f}M tokens/s" + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 28fc383a318dd..e6391134ff932 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.utils.argparse_utils import FlexibleArgumentParser try: - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer except ImportError: from backend_request_func import get_tokenizer diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 55001cf3722a0..33aca831883aa 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase try: - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer except ImportError: from backend_request_func import get_tokenizer @@ -574,7 +574,7 @@ async def benchmark( ) print( "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput + "Total token throughput (tok/s):", metrics.total_token_throughput ) ) @@ -963,8 +963,7 @@ def create_argument_parser(): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) parser.add_argument( "--result-dir", diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 904f805349148..d072c03c440b2 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -5,11 +5,12 @@ import argparse import asyncio import logging import os +import time +import uuid +from urllib.parse import urlparse import aiohttp from quart import Quart, Response, make_response, request -from rate_limiter import RateLimiter -from request_queue import RequestQueue # Configure logging logging.basicConfig(level=logging.INFO) @@ -24,26 +25,8 @@ def parse_args(): parser.add_argument( "--timeout", type=float, - default=300, - help="Timeout for backend service requests in seconds (default: 300)", - ) - parser.add_argument( - "--max-concurrent", - type=int, - default=100, - help="Maximum concurrent requests to backend services (default: 100)", - ) - parser.add_argument( - "--queue-size", - type=int, - default=500, - help="Maximum number of requests in the queue (default: 500)", - ) - parser.add_argument( - "--rate-limit", - type=int, - default=40, - help="Maximum requests per second (default: 40)", + default=6 * 60 * 60, + help="Timeout for backend service requests in seconds (default: 21600)", ) parser.add_argument( "--port", @@ -54,14 +37,32 @@ def parse_args(): parser.add_argument( "--prefill-url", type=str, - default="http://localhost:8100/v1/completions", - help="Prefill service endpoint URL", + default="http://localhost:8100", + help="Prefill service base URL (protocol + host[:port])", ) parser.add_argument( "--decode-url", type=str, - default="http://localhost:8200/v1/completions", - help="Decode service endpoint URL", + default="http://localhost:8200", + help="Decode service base URL (protocol + host[:port])", + ) + parser.add_argument( + "--kv-host", + type=str, + default="localhost", + help="Hostname or IP used by KV transfer (default: localhost)", + ) + parser.add_argument( + "--prefill-kv-port", + type=int, + default=14579, + help="Prefill KV port (default: 14579)", + ) + parser.add_argument( + "--decode-kv-port", + type=int, + default=14580, + help="Decode KV port (default: 14580)", ) return parser.parse_args() @@ -73,70 +74,129 @@ def main(): # Initialize configuration using command line parameters AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) - MAX_CONCURRENT_REQUESTS = args.max_concurrent - REQUEST_QUEUE_SIZE = args.queue_size - RATE_LIMIT = args.rate_limit PREFILL_SERVICE_URL = args.prefill_url DECODE_SERVICE_URL = args.decode_url PORT = args.port + PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}" + DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}" + + logger.info( + "Proxy resolved KV addresses -> prefill: %s, decode: %s", + PREFILL_KV_ADDR, + DECODE_KV_ADDR, + ) + app = Quart(__name__) - # Initialize the rate limiter and request queue - rate_limiter = RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) - - # Attach the configuration object to the application instance + # Attach the configuration object to the application instance so helper + # coroutines can read the resolved backend URLs and timeouts without using + # globals. app.config.update( { "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + "PREFILL_KV_ADDR": PREFILL_KV_ADDR, + "DECODE_KV_ADDR": DECODE_KV_ADDR, } ) - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) + def _normalize_base_url(url: str) -> str: + """Remove any trailing slash so path joins behave predictably.""" + return url.rstrip("/") - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + def _get_host_port(url: str) -> str: + """Return the hostname:port portion for logging and KV headers.""" + parsed = urlparse(url) + host = parsed.hostname or "localhost" + port = parsed.port + if port is None: + port = 80 if parsed.scheme == "http" else 443 + return f"{host}:{port}" - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' + PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) + DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) + KV_TARGET = _get_host_port(DECODE_SERVICE_URL) + + def _build_headers(request_id: str) -> dict[str, str]: + """Construct the headers expected by vLLM's P2P disagg connector.""" + headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + async def _run_prefill( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{PREFILL_BASE}{request_path}" + start_ts = time.perf_counter() + logger.info("[prefill] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + raise RuntimeError( + f"Prefill backend error {resp.status}: {error_text}" + ) + await resp.read() + logger.info( + "[prefill] done request_id=%s status=%s elapsed=%.2fs", + request_id, + resp.status, + time.perf_counter() - start_ts, + ) + except asyncio.TimeoutError as exc: + raise RuntimeError(f"Prefill service timeout at {url}") from exc + except aiohttp.ClientError as exc: + raise RuntimeError(f"Prefill service unavailable at {url}") from exc + + async def _stream_decode( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{DECODE_BASE}{request_path}" + # Stream tokens from the decode service once the prefill stage has + # materialized KV caches on the target workers. + logger.info("[decode] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + logger.error( + "Decode backend error %s - %s", resp.status, error_text + ) + err_msg = ( + '{"error": "Decode backend error ' + str(resp.status) + '"}' + ) + yield err_msg.encode() + return + logger.info( + "[decode] streaming response request_id=%s status=%s", + request_id, + resp.status, + ) + async for chunk_bytes in resp.content.iter_chunked(1024): + yield chunk_bytes + logger.info("[decode] finished streaming request_id=%s", request_id) + except asyncio.TimeoutError: + logger.error("Decode service timeout at %s", url) + yield b'{"error": "Decode service timeout"}' + except aiohttp.ClientError as exc: + logger.error("Decode service error at %s: %s", url, exc) + yield b'{"error": "Decode service unavailable"}' async def process_request(): """Process a single request through prefill and decode stages""" @@ -146,13 +206,27 @@ def main(): # Create prefill request (max_tokens=1) prefill_request = original_request_data.copy() prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue + # The request id encodes both KV socket addresses so the backend can + # shuttle tensors directly via NCCL once the prefill response + # completes. + request_id = ( + f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_" + f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}" + ) + + headers = _build_headers(request_id) + await _run_prefill(request.path, prefill_request, headers, request_id) # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) + # Pass the unmodified user request so the decode phase can continue + # sampling with the already-populated KV cache. + generator = _stream_decode( + request.path, original_request_data, headers, request_id + ) response = await make_response(generator) response.timeout = None # Disable timeout for streaming response return response @@ -168,23 +242,10 @@ def main(): @app.route("/v1/completions", methods=["POST"]) async def handle_request(): """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - try: - # Return the response from the processing task - return await task + return await process_request() except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") + logger.warning("Request cancelled") return Response( response=b'{"error": "Request cancelled"}', status=503, diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index d809bf1db8cbc..fb3329975cee3 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -14,6 +14,9 @@ from tqdm import tqdm import vllm._custom_ops as ops from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8, +) @dataclass @@ -22,6 +25,7 @@ class bench_params_t: hidden_size: int add_residual: bool dtype: torch.dtype + group_size: list[int] def description(self): return ( @@ -29,6 +33,7 @@ class bench_params_t: f"x D {self.hidden_size} " f"x R {self.add_residual} " f"x DT {self.dtype}" + f"x GS {self.group_size}" ) @@ -38,10 +43,11 @@ def get_bench_params() -> list[bench_params_t]: HIDDEN_SIZES = list(range(1024, 8129, 1024)) ADD_RESIDUAL = [True, False] DTYPES = [torch.bfloat16, torch.float] + GROUP_SIZES = [[1, 64], [1, 128]] - combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES) + combinations = product(NUM_TOKENS, HIDDEN_SIZES, ADD_RESIDUAL, DTYPES, GROUP_SIZES) bench_params = list( - map(lambda x: bench_params_t(x[0], x[1], x[2], x[3]), combinations) + map(lambda x: bench_params_t(x[0], x[1], x[2], x[3], x[4]), combinations) ) return bench_params @@ -52,6 +58,7 @@ def unfused_int8_impl( x: torch.Tensor, residual: torch.Tensor | None, quant_dtype: torch.dtype, + group_size: list[int], ): # Norm torch_out = None @@ -69,6 +76,7 @@ def unfused_fp8_impl( x: torch.Tensor, residual: torch.Tensor | None, quant_dtype: torch.dtype, + group_size: list[int], ): # Norm torch_out = None @@ -81,23 +89,63 @@ def unfused_fp8_impl( torch_out, _ = ops.scaled_fp8_quant(torch_out) +def unfused_groupwise_fp8_impl( + rms_norm_layer: RMSNorm, + x: torch.Tensor, + residual: torch.Tensor | None, + quant_dtype: torch.dtype, + group_size: list[int], +): + # Norm + torch_out = None + if residual is None: + torch_out = rms_norm_layer.forward_cuda(x, residual) + else: + torch_out, _ = rms_norm_layer.forward_cuda(x, residual) + + # Quant + torch_out, _ = per_token_group_quant_fp8( + torch_out, group_size=group_size[1], use_ue8m0=False + ) + + def fused_impl( rms_norm_layer: RMSNorm, # this stores the weights x: torch.Tensor, residual: torch.Tensor | None, quant_dtype: torch.dtype, + group_size: list[int], ): out, _ = ops.rms_norm_dynamic_per_token_quant( x, rms_norm_layer.weight, 1e-6, quant_dtype, residual=residual ) +def fused_groupwise_impl( + rms_norm_layer: RMSNorm, # this stores the weights + x: torch.Tensor, + residual: torch.Tensor | None, + quant_dtype: torch.dtype, + group_size: list[int], +): + out, _ = ops.rms_norm_per_block_quant( + x, + rms_norm_layer.weight, + 1e-6, + quant_dtype, + group_size, + residual=residual, + is_scale_transposed=True, + ) + + # Bench functions def bench_fn( rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor, quant_dtype: torch.dtype, + group_size: list[int], label: str, sub_label: str, fn: Callable, @@ -110,10 +158,11 @@ def bench_fn( "x": x, "residual": residual, "quant_dtype": quant_dtype, + "group_size": group_size, "fn": fn, } return TBenchmark.Timer( - stmt="fn(rms_norm_layer, x, residual, quant_dtype)", + stmt="fn(rms_norm_layer, x, residual, quant_dtype, group_size)", globals=globals, label=label, sub_label=sub_label, @@ -147,6 +196,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.int8, + params.group_size, label, sub_label, unfused_int8_impl, @@ -161,6 +211,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.float8_e4m3fn, + params.group_size, label, sub_label, unfused_fp8_impl, @@ -175,6 +226,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.int8, + params.group_size, label, sub_label, fused_impl, @@ -189,6 +241,7 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu x, residual, torch.float8_e4m3fn, + params.group_size, label, sub_label, fused_impl, @@ -196,6 +249,36 @@ def bench(params: bench_params_t, label: str, sub_label: str) -> Iterable[TMeasu ) ) + # unfused groupwise fp8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + params.group_size, + label, + sub_label, + unfused_groupwise_fp8_impl, + "unfused_groupwise_fp8_impl", + ) + ) + + # fused groupwise fp8 impl. + timers.append( + bench_fn( + layer, + x, + residual, + torch.float8_e4m3fn, + params.group_size, + label, + sub_label, + fused_groupwise_impl, + "fused_groupwise_fp8_impl", + ) + ) + print_timers(timers) return timers diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py new file mode 100644 index 0000000000000..7517376535925 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_quant.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton +from vllm.utils.flashinfer import flashinfer_fp4_quantize + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "vllm": dict(backend="vllm", enabled=True), + "flashinfer": dict(backend="flashinfer", enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor: + """Compute global scale for FP4 quantization.""" + amax = torch.abs(tensor).max().to(torch.float32) + return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="us (lower is better)", + plot_name="NVFP4 Input Quantization Latency (us)", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale for activation + a_global_scale = compute_global_scale(a) + + quantiles = [0.5, 0.2, 0.8] + + cfg = PROVIDER_CFGS[provider] + + if cfg["backend"] == "vllm": + # vLLM's FP4 quantization + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: ops.scaled_fp4_quant(a, a_global_scale), + quantiles=quantiles, + ) + elif cfg["backend"] == "flashinfer": + # FlashInfer's FP4 quantization + # Use is_sf_swizzled_layout=True to match vLLM's output format + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ), + quantiles=quantiles, + ) + + # Convert ms to us for better readability at small batch sizes + to_us = lambda t_ms: t_ms * 1000 + return to_us(ms), to_us(max_ms), to_us(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str): + """Test accuracy between vLLM and FlashInfer FP4 quantization.""" + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale + a_global_scale = compute_global_scale(a) + + # vLLM quantization + vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale) + + # FlashInfer quantization (with swizzled layout to match vLLM's output) + flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ) + flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn) + + # Compare outputs + torch.testing.assert_close( + vllm_fp4, + flashinfer_fp4, + ) + print(f"M={M}, K={K}, dtype={dtype}: PASSED") + + +def test_accuracy(): + """Run accuracy tests across various shapes.""" + print("\n" + "=" * 60) + print("Running accuracy tests: vLLM vs FlashInfer") + print("=" * 60) + + device = "cuda" + dtype = torch.bfloat16 + + # Test various batch sizes and hidden dimensions + Ms = [1, 1024] + Ks = [4096] + + for M in Ms: + for K in Ks: + _test_accuracy_once(M, K, dtype, device) + + print("\nAll accuracy tests passed!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark NVFP4 quantization: vLLM vs FlashInfer" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save benchmark results", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy tests", + ) + args = parser.parse_args() + + if args.accuracy: + test_accuracy() + + for K, N, model in prepare_shapes(args): + print(f"\n{model}, N={N} K={K}") + benchmark.run( + print_data=True, + save_path=args.save_path, + N=N, + K=K, + ) + + print("\nBenchmark finished!") diff --git a/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py new file mode 100644 index 0000000000000..04921dafbdbea --- /dev/null +++ b/benchmarks/kernels/benchmark_2d_silu_mul_fp8_quant.py @@ -0,0 +1,244 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from enum import Enum +from itertools import product +from typing import Any + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement + +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + _per_token_group_quant_fp8_colmajor, + silu_mul_per_token_group_quant_fp8_colmajor, +) +from vllm.triton_utils import triton +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used + +from .utils import ArgPool, Bench, CudaGraphBenchParams + +GROUP_SIZE = 128 +FLOAT8_T = torch.float8_e4m3fn + + +def print_timers(timers: list[TMeasurement], cuda_graph_nops: int): + print( + f"Note : The timings reported above is for {cuda_graph_nops} " + "consecutive invocations of the benchmarking functions. " + f"Please divide by {cuda_graph_nops} for single invocation " + "timings." + ) + compare = TBenchmark.Compare(timers) + compare.print() + + +class ImplType(Enum): + SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR = 1 + REFERENCE = 2 + + def get_impl(self): + if self == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR: + return silu_mul_per_token_group_quant_fp8_colmajor + elif self == ImplType.REFERENCE: + return reference + raise ValueError(f"Unrecognized ImplType {self}") + + +@dataclass +class BenchmarkTensors: + input: torch.Tensor + output: torch.Tensor + + # Reference act output tensor + ref_act_out: torch.Tensor + ref_quant_out: torch.Tensor + + @staticmethod + def make(T: int, N: int) -> "BenchmarkTensors": + assert T % GROUP_SIZE == 0 + assert N % (GROUP_SIZE * 2) == 0 + + input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda") + + # silu_mul_per_token_group_quant_fp8_colmajor output. + output = torch.rand((T, N // 2), dtype=torch.bfloat16, device="cuda").to( + FLOAT8_T + ) + + # reference output. + ref_act_out = torch.empty((T, N // 2), dtype=torch.bfloat16, device="cuda") + ref_quant_out = torch.empty( + (T, N // 2), dtype=torch.bfloat16, device="cuda" + ).to(FLOAT8_T) + + return BenchmarkTensors( + input=input, + output=output, + ref_act_out=ref_act_out, + ref_quant_out=ref_quant_out, + ) + + @property + def T(self): + return self.input.size(0) + + @property + def N(self): + return self.input.size(1) + + def make_impl_kwargs(self, impl_type: ImplType) -> dict[str, Any]: + if impl_type == ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR: + return { + "input": self.input, + "output": self.output, + "use_ue8m0": is_deep_gemm_e8m0_used(), + } + elif impl_type == ImplType.REFERENCE: + return { + "input": self.input, + "act_out": self.ref_act_out, + "quant_out": self.ref_quant_out, + "use_ue8m0": is_deep_gemm_e8m0_used(), + } + raise ValueError(f"Unrecognized impl_type {impl_type}") + + +def reference_quant(x: torch.Tensor, quant_out: torch.Tensor, use_ue8m0: bool): + """ + Reference triton quant kernel from, + vllm.model_executor.layers.quantization.utils.fp8_utils + """ + assert quant_out.size() == x.size() + # Allocate the scale tensor column-major format. + shape = (x.shape[-1] // GROUP_SIZE,) + x.shape[:-1] + x_q = quant_out + x_s = torch.empty(shape, device=x.device, dtype=torch.float32).permute(-1, -2) + + M = x.numel() // GROUP_SIZE + N = GROUP_SIZE + BLOCK = triton.next_power_of_2(N) + # heuristics for number of warps + num_warps = min(max(BLOCK // 256, 1), 8) + num_stages = 1 + + finfo = torch.finfo(FLOAT8_T) + fp8_min = finfo.min + fp8_max = finfo.max + + _per_token_group_quant_fp8_colmajor[(M,)]( + x, + x_q, + x_s, + GROUP_SIZE, + x.shape[1], + x.stride(0), + x_s.stride(1), + eps=1e-10, + fp8_min=fp8_min, + fp8_max=fp8_max, + use_ue8m0=use_ue8m0, + BLOCK=BLOCK, + num_warps=num_warps, + num_stages=num_stages, + ) + return x_q, x_s + + +def reference( + input: torch.Tensor, + act_out: torch.Tensor, + quant_out: torch.Tensor, + use_ue8m0: bool, +) -> tuple[torch.Tensor, torch.Tensor]: + torch.ops._C.silu_and_mul(act_out, input) + return reference_quant(act_out, quant_out, use_ue8m0) + + +def bench_impl( + bench_tensors: list[BenchmarkTensors], impl_type: ImplType +) -> TMeasurement: + T = bench_tensors[0].T + N = bench_tensors[0].N + + arg_pool_size = len(bench_tensors) + kwargs_list = [bt.make_impl_kwargs(impl_type) for bt in bench_tensors] + + # warmup + for kwargs in kwargs_list: + impl_type.get_impl()(**kwargs) + torch.cuda.synchronize() + + # Merge into a single kwargs and qualify arguments as ArgPool + kwargs = {k: ArgPool([]) for k in kwargs_list[0]} + for _kwargs in kwargs_list: + for k, v in _kwargs.items(): + kwargs[k].values.append(v) + + cuda_graph_params = None + cuda_graph_params = CudaGraphBenchParams(arg_pool_size) + timer = None + with Bench( + cuda_graph_params, + "silu-mul-quant", + f"num_tokens={T}, N={N}", + impl_type.name, + impl_type.get_impl(), + **kwargs, + ) as bench: + timer = bench.run() + return timer + + +def test_correctness(T: int, N: int): + print(f"Testing num_tokens={T}, N={N} ...") + + bench_tensor = BenchmarkTensors.make(T, N) + + def output_from_impl(impl: ImplType) -> tuple[torch.Tensor, torch.Tensor]: + return impl.get_impl()(**bench_tensor.make_impl_kwargs(impl)) + + # reference output + ref_out_q, ref_out_s = output_from_impl(ImplType.REFERENCE) + + # test ouptut + out_q, out_s = output_from_impl( + ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR + ) + + torch.testing.assert_close(ref_out_q.to(torch.float32), out_q.to(torch.float32)) + torch.testing.assert_close(ref_out_s, out_s) + + +def run(Ts: list[int], Ns: list[int], arg_pool_size: int) -> list[TMeasurement]: + timers = [] + for N, T in product(Ns, Ts): + test_correctness(T, N) + + bench_tensors: list[BenchmarkTensors] = [ + BenchmarkTensors.make(T, N) for _ in range(arg_pool_size) + ] + + silu_mul_quant_timer = bench_impl( + bench_tensors, ImplType.SILU_MUL_PER_TOKEN_GROUP_QUANT_FP8_COLMAJOR + ) + timers.append(silu_mul_quant_timer) + reference_timer = bench_impl(bench_tensors, ImplType.REFERENCE) + timers.append(reference_timer) + + print_timers( + [silu_mul_quant_timer, reference_timer], cuda_graph_nops=arg_pool_size + ) + + print_timers(timers, cuda_graph_nops=arg_pool_size) + + return timers + + +if __name__ == "__main__": + T = [128 * i for i in range(1, 16)] + [2048 * i for i in range(1, 65)] + N = [2048, 4096, 8192] + + print(f"T = {T}, N = {N}") + run(T, N, arg_pool_size=8) diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index 66268b71b3de6..d31e67057d8f6 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -13,8 +13,8 @@ from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE -batch_size_range = [1, 16, 32, 64, 128] -seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] +batch_size_range = [1, 16, 128] +seq_len_range = [1, 16, 64, 1024, 4096] intermediate_size = [3072, 9728, 12288] configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 8787724d77cfb..ac78c019a59e5 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: b_q_weight=w_q, b_bias=None, b_scales=w_s, + a_scales=None, global_scale=None, b_zeros=w_zp, g_idx=g_idx, diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 12ca9214b1f95..48d790aec9e07 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -263,7 +263,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -273,7 +273,7 @@ def bench_run( results.append( benchmark.Timer( - stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 + stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py new file mode 100644 index 0000000000000..fb3b6c8f12003 --- /dev/null +++ b/benchmarks/kernels/benchmark_mla_k_concat.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation +in MLA (Multi-head Latent Attention) prefill. + +This validates that the optimization from commit 8d4142bd is beneficial across +various batch sizes, not just the originally tested batch size of 32768. +""" + +import time +from collections.abc import Callable + +import torch + +# DeepSeek-V3 MLA dimensions +NUM_HEADS = 128 +QK_NOPE_HEAD_DIM = 128 +PE_DIM = 64 + + +def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor: + """Original torch.cat approach with expand.""" + return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + + +def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor: + """Optimized direct copy approach (avoids expand + cat overhead).""" + k = torch.empty( + (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]), + dtype=k_nope.dtype, + device=k_nope.device, + ) + k[..., : k_nope.shape[-1]] = k_nope + k[..., k_nope.shape[-1] :] = k_pe + return k + + +def benchmark_method( + method: Callable, + k_nope: torch.Tensor, + k_pe: torch.Tensor, + num_warmup: int = 10, + num_iters: int = 100, +) -> float: + """Benchmark a concatenation method and return mean latency in ms.""" + # Warmup + for _ in range(num_warmup): + _ = method(k_nope, k_pe) + torch.cuda.synchronize() + + # Benchmark + start = time.perf_counter() + for _ in range(num_iters): + _ = method(k_nope, k_pe) + torch.cuda.synchronize() + end = time.perf_counter() + + return (end - start) / num_iters * 1000 # Convert to ms + + +@torch.inference_mode() +def run_benchmark(dtype: torch.dtype, dtype_name: str): + """Run benchmark for a specific dtype.""" + torch.set_default_device("cuda") + + # Batch sizes to test (powers of 2 from 32 to 65536) + batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536] + + print("=" * 80) + print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation") + print("=" * 80) + print( + f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], " + f"k_pe=[B, 1, {PE_DIM}]" + ) + print(f"dtype: {dtype_name}") + print() + print( + f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | " + f"{'Speedup':>8} | {'Reduction':>10}" + ) + print("-" * 70) + + results = [] + for batch_size in batch_sizes: + # Create input tensors (generate in float32 then convert for FP8 compatibility) + k_nope = torch.randn( + batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda" + ).to(dtype) + k_pe = torch.randn( + batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda" + ).to(dtype) + + # Benchmark both methods + cat_time = benchmark_method(cat_method, k_nope, k_pe) + direct_time = benchmark_method(direct_copy_method, k_nope, k_pe) + + speedup = cat_time / direct_time + reduction = (1 - direct_time / cat_time) * 100 + + results.append((batch_size, cat_time, direct_time, speedup, reduction)) + + print( + f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | " + f"{speedup:>7.2f}x | {reduction:>9.1f}%" + ) + + print("=" * 80) + + # Summary statistics + speedups = [r[3] for r in results] + print("\nSpeedup summary:") + print(f" Min: {min(speedups):.2f}x") + print(f" Max: {max(speedups):.2f}x") + print(f" Mean: {sum(speedups) / len(speedups):.2f}x") + + # Find crossover point + crossover_batch = None + for batch_size, _, _, speedup, _ in results: + if speedup >= 1.0: + crossover_batch = batch_size + break + + print("\nConclusion:") + if crossover_batch: + print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}") + # Filter for large batches (>= 512 which is typical for prefill) + large_batch_speedups = [r[3] for r in results if r[0] >= 512] + if large_batch_speedups: + avg_large = sum(large_batch_speedups) / len(large_batch_speedups) + print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x") + print(" - MLA prefill typically uses large batches, so optimization is effective") + + return results + + +@torch.inference_mode() +def main(): + # Test bfloat16 + print("\n") + run_benchmark(torch.bfloat16, "bfloat16") + + # Test float8_e4m3fn + print("\n") + run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/kernels/benchmark_moe_align_block_size.py b/benchmarks/kernels/benchmark_moe_align_block_size.py index f540cff6261a8..5f9a131f79b0e 100644 --- a/benchmarks/kernels/benchmark_moe_align_block_size.py +++ b/benchmarks/kernels/benchmark_moe_align_block_size.py @@ -24,12 +24,15 @@ def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor: num_tokens_range = [1, 16, 256, 4096] num_experts_range = [16, 64, 224, 256, 280, 512] topk_range = [1, 2, 8] -configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range)) +ep_size_range = [1, 8] +configs = list( + itertools.product(num_tokens_range, num_experts_range, topk_range, ep_size_range) +) @triton.testing.perf_report( triton.testing.Benchmark( - x_names=["num_tokens", "num_experts", "topk"], + x_names=["num_tokens", "num_experts", "topk", "ep_size"], x_vals=configs, line_arg="provider", line_vals=["vllm"], @@ -38,16 +41,26 @@ configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range args={}, ) ) -def benchmark(num_tokens, num_experts, topk, provider): +def benchmark(num_tokens, num_experts, topk, ep_size, provider): """Benchmark function for Triton.""" block_size = 256 + torch.cuda.manual_seed_all(0) topk_ids = get_topk_ids(num_tokens, num_experts, topk) + e_map = None + if ep_size != 1: + local_e = num_experts // ep_size + e_ids = torch.randperm(num_experts, device="cuda", dtype=torch.int32)[:local_e] + e_map = torch.full((num_experts,), -1, device="cuda", dtype=torch.int32) + e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32) + quantiles = [0.5, 0.2, 0.8] if provider == "vllm": ms, min_ms, max_ms = triton.testing.do_bench( - lambda: moe_align_block_size(topk_ids, block_size, num_experts), + lambda: moe_align_block_size( + topk_ids, block_size, num_experts, e_map, ignore_invalid_experts=True + ), quantiles=quantiles, ) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 83bd91917508f..09de5fa822f86 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -99,7 +99,6 @@ def benchmark_mrope( # the parameters to compute the q k v size based on tp_size mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=rope_parameters, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 074b7a440b612..7a1bc050bb33f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device): def benchmark(batch_size, seq_len, num_heads, provider): dtype = torch.bfloat16 max_position = 8192 - base = 10000 - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"partial_rotary_factor": rotary_dim / head_size} + rope = get_rope(head_size, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=device) cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device) diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md index 41e68e047be82..a28c6956be0e9 100644 --- a/benchmarks/kernels/deepgemm/README.md +++ b/benchmarks/kernels/deepgemm/README.md @@ -2,7 +2,7 @@ This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels. -Currently this just includes dense GEMMs and only works on Hopper GPUs. +Currently, this just includes dense GEMMs and only works on Hopper GPUs. ## Setup diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index fbbb03c5ed465..0af87fd7f0b53 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON endif() # Build ACL with CMake - set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF") - set(CMAKE_BUILD_TYPE "Release") - set(ARM_COMPUTE_ARCH "armv8.2-a") - set(ARM_COMPUTE_ENABLE_ASSERTS "OFF") - set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - set(ARM_COMPUTE_ENABLE_OPENMP "ON") - set(ARM_COMPUTE_ENABLE_WERROR "OFF") - set(ARM_COMPUTE_BUILD_EXAMPLES "OFF") - set(ARM_COMPUTE_BUILD_TESTING "OFF") - set(_cmake_config_cmd ${CMAKE_COMMAND} -G Ninja -B build -DARM_COMPUTE_BUILD_SHARED_LIB=OFF @@ -341,7 +330,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON PUBLIC ${oneDNN_BINARY_DIR}/include PRIVATE ${oneDNN_SOURCE_DIR}/src ) - target_link_libraries(dnnl_ext dnnl) + target_link_libraries(dnnl_ext dnnl torch) target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) list(APPEND LIBS dnnl_ext) set(USE_ONEDNN ON) @@ -369,13 +358,13 @@ set(VLLM_EXT_SRC "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp" "csrc/cpu/cpu_attn.cpp" - "csrc/cpu/scratchpad_manager.cpp" "csrc/cpu/torch_bindings.cpp") if (AVX512_FOUND AND NOT AVX512_DISABLED) set(VLLM_EXT_SRC "csrc/cpu/shm.cpp" "csrc/cpu/cpu_wna16.cpp" + "csrc/cpu/cpu_fused_moe.cpp" ${VLLM_EXT_SRC}) if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI) set(VLLM_EXT_SRC diff --git a/cmake/external_projects/flashmla.cmake b/cmake/external_projects/flashmla.cmake index 2cf3c1a755d3c..0d4f9b7aa07c8 100644 --- a/cmake/external_projects/flashmla.cmake +++ b/cmake/external_projects/flashmla.cmake @@ -35,16 +35,21 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") # sm90a set(SUPPORT_ARCHS) -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3) - list(APPEND SUPPORT_ARCHS 9.0a) +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3) + list(APPEND SUPPORT_ARCHS "9.0a") endif() -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8) - list(APPEND SUPPORT_ARCHS 10.0a) +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9) + # CUDA 12.9 has introduced "Family-Specific Architecture Features" + # this supports all compute_10x family + list(APPEND SUPPORT_ARCHS "10.0f") +elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8) + list(APPEND SUPPORT_ARCHS "10.0a") endif() cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}") if(FLASH_MLA_ARCHS) + message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}") set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS}) list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math") @@ -126,7 +131,8 @@ if(FLASH_MLA_ARCHS) $<$:-UPy_LIMITED_API> $<$:-UPy_LIMITED_API>) else() - # Create empty targets for setup.py when not targeting sm90a systems + message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}") + # Create empty targets for setup.py on unsupported systems add_custom_target(_flashmla_C) add_custom_target(_flashmla_extension_C) endif() diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 6cc5cda14c525..ff687e0af7b44 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 71bb26f6295449be880344b93b51791cc009237d + GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/cmake/utils.cmake b/cmake/utils.cmake index ca0062ba4fabe..bdb2ba74d944d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR) run_python(_VLLM_TORCH_GOMP_PATH " import os, glob -try: - import torch - torch_pkg = os.path.dirname(torch.__file__) - site_root = os.path.dirname(torch_pkg) - torch_libs = os.path.join(site_root, 'torch.libs') - print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0]) -except: - print('') +import torch +torch_pkg = os.path.dirname(torch.__file__) +site_root = os.path.dirname(torch_pkg) + +# Search both torch.libs and torch/lib +roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')] +candidates = [] +for root in roots: + if not os.path.isdir(root): + continue + candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*'))) + +print(candidates[0] if candidates else '') " - "failed to probe torch.libs for libgomp") + "failed to probe for libgomp") if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}") return() @@ -495,7 +500,13 @@ function (define_extension_target MOD_NAME) set(SOABI_KEYWORD "") endif() - if (ARG_USE_SABI) + run_python(IS_FREETHREADED_PYTHON + "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)" + "Failed to determine whether interpreter is free-threaded") + + # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809), + # so avoid using the stable ABI under free-threading only. + if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON) Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}") else() Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}") diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index a4a880f13cf7e..8268065ef02c8 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -15,19 +15,61 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x, const scalar_t& y) { return act_first ? ACT_FN(x) * y : x * ACT_FN(y); } -// Activation and gating kernel template. +// Check if all pointers are 16-byte aligned for int4 vectorized access +__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) { + return (reinterpret_cast(ptr) & 15) == 0; +} + +// Activation and gating kernel template. template __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d) { + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = compute(x, y); + const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* y_ptr = x_ptr + d; + scalar_t* out_ptr = out + token_idx * d; + + // Check alignment for 128-bit vectorized access. + // All three pointers must be 16-byte aligned for safe int4 operations. + const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && + is_16byte_aligned(out_ptr); + + if (aligned && d >= VEC_SIZE) { + // Fast path: 128-bit vectorized loop + const int4* x_vec = reinterpret_cast(x_ptr); + const int4* y_vec = reinterpret_cast(y_ptr); + int4* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / VEC_SIZE; + const int vec_end = num_vecs * VEC_SIZE; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; + auto* xp = reinterpret_cast(&x); + auto* yp = reinterpret_cast(&y); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < VEC_SIZE; j++) { + rp[j] = compute(xp[j], yp[j]); + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = compute(VLLM_LDG(&x_ptr[i]), + VLLM_LDG(&y_ptr[i])); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&x_ptr[idx]); + const scalar_t y = VLLM_LDG(&y_ptr[idx]); + out_ptr[idx] = compute(x, y); + } } } @@ -120,50 +162,115 @@ template __global__ void act_and_mul_kernel_with_param( scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, const float param) { + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); - const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = ACT_FN(x, param) * y; + const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* y_ptr = x_ptr + d; + scalar_t* out_ptr = out + token_idx * d; + + // Check alignment for 128-bit vectorized access + const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && + is_16byte_aligned(out_ptr); + + if (aligned && d >= VEC_SIZE) { + // Fast path: 128-bit vectorized loop + const int4* x_vec = reinterpret_cast(x_ptr); + const int4* y_vec = reinterpret_cast(y_ptr); + int4* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / VEC_SIZE; + const int vec_end = num_vecs * VEC_SIZE; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; + auto* xp = reinterpret_cast(&x); + auto* yp = reinterpret_cast(&y); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < VEC_SIZE; j++) { + rp[j] = ACT_FN(xp[j], param) * yp[j]; + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&x_ptr[idx]); + const scalar_t y = VLLM_LDG(&y_ptr[idx]); + out_ptr[idx] = ACT_FN(x, param) * y; + } } } template __device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up, float alpha, float limit) { - // clamp gate: min=None, max=limit - const float gate_f = (float)gate; - const float clamped_gate = gate_f > limit ? limit : gate_f; - - // clamp up: min=-limit, max=limit - const float up_f = (float)up; - const float clamped_up = - up_f > limit ? limit : (up_f < -limit ? -limit : up_f); - - // glu = gate * sigmoid(gate * alpha) - const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha)); - const float glu = clamped_gate * sigmoid_val; - - // (up + 1) * glu - return (T)((clamped_up + 1.0f) * glu); + // Clamp gate to (-inf, limit] and up to [-limit, limit] + const float g = fminf((float)gate, limit); + const float u = fmaxf(fminf((float)up, limit), -limit); + // glu = gate * sigmoid(gate * alpha), then return (up + 1) * glu + return (T)((u + 1.0f) * g / (1.0f + expf(-g * alpha))); } +// Interleaved gate/up: input has [gate0, up0, gate1, up1, ...]. template __global__ void swigluoai_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., 2, d] + const scalar_t* __restrict__ input, // [..., 2 * d] (interleaved) const int d, const float alpha, const float limit) { + // For interleaved data: input has 2*d elements per token (gate/up pairs) + // output has d elements per token + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); + constexpr int PAIRS = VEC_SIZE / 2; // Number of gate/up pairs per int4 load const int64_t token_idx = blockIdx.x; - // TODO: Vectorize loads and stores. - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - // gate = x[..., ::2] (even indices) - const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]); - // up = x[..., 1::2] (odd indices) - const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]); + const scalar_t* in_ptr = input + token_idx * 2 * d; + scalar_t* out_ptr = out + token_idx * d; - out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit); + // Check alignment for 128-bit vectorized access on input. + // For output we use int2 (64-bit) which has 8-byte alignment requirement. + const bool in_aligned = is_16byte_aligned(in_ptr); + const bool out_aligned = + (reinterpret_cast(out_ptr) & 7) == 0; // 8-byte for int2 + + if (in_aligned && out_aligned && d >= PAIRS) { + // Fast path: vectorized loop + // Each int4 load gives VEC_SIZE elements = PAIRS gate/up pairs + // Each int2 store writes PAIRS output elements + const int4* in_vec = reinterpret_cast(in_ptr); + int2* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / PAIRS; + const int vec_end = num_vecs * PAIRS; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 v = VLLM_LDG(&in_vec[i]); + int2 r; + auto* vp = reinterpret_cast(&v); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < PAIRS; j++) { + rp[j] = ACT_FN(vp[2 * j], vp[2 * j + 1], alpha, limit); + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[2 * i]), + VLLM_LDG(&in_ptr[2 * i + 1]), alpha, limit); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + // gate = x[..., ::2] (even indices) + const scalar_t gate = VLLM_LDG(&in_ptr[2 * idx]); + // up = x[..., 1::2] (odd indices) + const scalar_t up = VLLM_LDG(&in_ptr[2 * idx + 1]); + out_ptr[idx] = ACT_FN(gate, up, alpha, limit); + } } } @@ -217,10 +324,41 @@ __global__ void activation_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., d] const int d) { + constexpr int VEC_SIZE = 16 / sizeof(scalar_t); const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]); - out[token_idx * d + idx] = ACT_FN(x); + const scalar_t* in_ptr = input + token_idx * d; + scalar_t* out_ptr = out + token_idx * d; + + // Check alignment for 128-bit vectorized access + const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr); + + if (aligned && d >= VEC_SIZE) { + // Fast path: 128-bit vectorized loop + const int4* in_vec = reinterpret_cast(in_ptr); + int4* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / VEC_SIZE; + const int vec_end = num_vecs * VEC_SIZE; + + for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { + int4 v = VLLM_LDG(&in_vec[i]), r; + auto* vp = reinterpret_cast(&v); + auto* rp = reinterpret_cast(&r); +#pragma unroll + for (int j = 0; j < VEC_SIZE; j++) { + rp[j] = ACT_FN(vp[j]); + } + out_vec[i] = r; + } + // Scalar cleanup for remaining elements + for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { + out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i])); + } + } else { + // Scalar fallback for unaligned data or small d + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&in_ptr[idx]); + out_ptr[idx] = ACT_FN(x); + } } } diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 229d9862fb670..27d1e990c611e 100644 --- a/csrc/attention/merge_attn_states.cu +++ b/csrc/attention/merge_attn_states.cu @@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel( scalar_t* output, float* output_lse, const scalar_t* prefix_output, const float* prefix_lse, const scalar_t* suffix_output, const float* suffix_lse, const uint num_tokens, const uint num_heads, - const uint head_size) { + const uint head_size, const uint prefix_head_stride, + const uint output_head_stride) { using pack_128b_t = uint4; const uint pack_size = 16 / sizeof(scalar_t); const uint threads_per_head = head_size / pack_size; @@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel( const uint head_idx = token_head_idx % num_heads; const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc. - const uint head_offset = - token_idx * num_heads * head_size + head_idx * head_size; - const scalar_t* prefix_head_ptr = prefix_output + head_offset; - const scalar_t* suffix_head_ptr = suffix_output + head_offset; - scalar_t* output_head_ptr = output + head_offset; + const uint src_head_offset = token_idx * num_heads * prefix_head_stride + + head_idx * prefix_head_stride; + const uint dst_head_offset = token_idx * num_heads * output_head_stride + + head_idx * output_head_stride; + const scalar_t* prefix_head_ptr = prefix_output + src_head_offset; + const scalar_t* suffix_head_ptr = suffix_output + src_head_offset; + scalar_t* output_head_ptr = output + dst_head_offset; float p_lse = prefix_lse[head_idx * num_tokens + token_idx]; float s_lse = suffix_lse[head_idx * num_tokens + token_idx]; @@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel( reinterpret_cast(prefix_lse.data_ptr()), \ reinterpret_cast(suffix_output.data_ptr()), \ reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \ - num_heads, head_size); \ + num_heads, head_size, prefix_head_stride, output_head_stride); \ } /*@brief Merges the attention states from prefix and suffix @@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output, const uint num_tokens = output.size(0); const uint num_heads = output.size(1); const uint head_size = output.size(2); + const uint prefix_head_stride = prefix_output.stride(1); + const uint output_head_stride = output.stride(1); const uint pack_size = 16 / sizeof(scalar_t); TORCH_CHECK(head_size % pack_size == 0, "headsize must be multiple of pack_size:", pack_size); - TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1, - "output heads must be contiguous in memory"); - TORCH_CHECK( - prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1, - "prefix_output heads must be contiguous in memory"); - TORCH_CHECK( - suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1, - "suffix_output heads must be contiguous in memory"); float* output_lse_ptr = nullptr; if (output_lse.has_value()) { output_lse_ptr = output_lse.value().data_ptr(); diff --git a/csrc/cache.h b/csrc/cache.h index b162a4a2bc31f..42ccb589683a9 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -8,16 +9,6 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, const torch::Tensor& block_mapping); -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping); - -void copy_blocks_mla(std::vector const& kv_caches, - const torch::Tensor& block_mapping); - void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, @@ -41,11 +32,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); void gather_and_maybe_dequant_cache( - torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] - torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] - torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] - torch::Tensor const& cu_seq_lens, // [BATCH+1] - int64_t batch_size, const std::string& kv_cache_dtype, + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS] + int64_t num_tokens, const std::string& kv_cache_dtype, torch::Tensor const& scale, std::optional seq_starts = std::nullopt); @@ -57,6 +49,15 @@ void cp_gather_cache( torch::Tensor const& cu_seq_lens, // [BATCH+1] int64_t batch_size, std::optional seq_starts = std::nullopt); +// Gather and upconvert FP8 KV cache to BF16 workspace +void cp_gather_and_upconvert_fp8_kv_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] + torch::Tensor const& dst, // [TOT_TOKENS, 576] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& seq_lens, // [BATCH] + torch::Tensor const& workspace_starts, // [BATCH] + int64_t batch_size); + // Indexer K quantization and cache function void indexer_k_quant_and_cache( torch::Tensor& k, // [num_tokens, head_dim] @@ -71,4 +72,4 @@ void cp_gather_indexer_k_quant_cache( torch::Tensor& dst_k, // [num_tokens, head_dim] torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4] const torch::Tensor& block_table, // [batch_size, num_blocks] - const torch::Tensor& cu_seq_lens); // [batch_size + 1] \ No newline at end of file + const torch::Tensor& cu_seq_lens); // [batch_size + 1] diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0aa0dc14c7480..cf26ae544deaa 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include "cuda_utils.h" #include "cuda_compat.h" @@ -118,94 +119,6 @@ __global__ void copy_blocks_mla_kernel( } // namespace vllm -// Note: the key_caches and value_caches vectors are constant but -// not the Tensors they contain. The vectors need to be const refs -// in order to satisfy pytorch's C++ operator registration code. -void copy_blocks(std::vector const& key_caches, - std::vector const& value_caches, - const torch::Tensor& block_mapping) { - int num_layers = key_caches.size(); - TORCH_CHECK(num_layers == value_caches.size()); - if (num_layers == 0) { - return; - } - torch::Device cache_device = key_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda()); - - // Create data structures for the kernel. - // Create an array of pointers to the key and value caches. - int64_t key_cache_ptrs[num_layers]; - int64_t value_cache_ptrs[num_layers]; - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - key_cache_ptrs[layer_idx] = - reinterpret_cast(key_caches[layer_idx].data_ptr()); - value_cache_ptrs[layer_idx] = - reinterpret_cast(value_caches[layer_idx].data_ptr()); - } - - // block_mapping is a 2D tensor with shape (num_pairs, 2). - int num_pairs = block_mapping.size(0); - - // Move the data structures to the GPU. - // NOTE: This synchronizes the CPU and GPU. - torch::Tensor key_cache_ptrs_tensor = - torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64) - .to(cache_device); - torch::Tensor value_cache_ptrs_tensor = - torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64) - .to(cache_device); - - // Launch the kernel. - const int numel_per_block = key_caches[0][0].numel(); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, numel_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] { - vllm::copy_blocks_kernel<<>>( - key_cache_ptrs_tensor.data_ptr(), - value_cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), numel_per_block); - })); -} - -// copy blocks kernel for MLA (assumes a joint KV-cache) -void copy_blocks_mla(std::vector const& kv_caches, - const torch::Tensor& block_mapping) { - int num_layers = kv_caches.size(); - if (num_layers == 0) { - return; - } - torch::Device cache_device = kv_caches[0].device(); - TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); - - std::vector cache_ptrs(num_layers); - for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { - cache_ptrs[layer_idx] = - reinterpret_cast(kv_caches[layer_idx].data_ptr()); - } - torch::Tensor cache_ptrs_tensor = - torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) - .to(cache_device); - - int num_pairs = block_mapping.size(0); - // We use the stride instead of numel in case the cache is padded for memory - // alignment reasons, we assume the blocks data (inclusive of any padding) - // is contiguous in memory - int mem_footprint_per_block = kv_caches[0].stride(0); - dim3 grid(num_layers, num_pairs); - dim3 block(std::min(1024, mem_footprint_per_block)); - const at::cuda::OptionalCUDAGuard device_guard(cache_device); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { - vllm::copy_blocks_mla_kernel<<>>( - cache_ptrs_tensor.data_ptr(), - block_mapping.data_ptr(), mem_footprint_per_block); - })); -} - namespace vllm { // Used to copy/convert one element @@ -514,7 +427,8 @@ __global__ void indexer_k_quant_and_cache_kernel( const int quant_block_size, // quantization block size const int cache_block_size, // cache block size const int cache_stride, // stride for each token in kv_cache - const bool use_ue8m0 // use ue8m0 scale format + + const bool use_ue8m0 // use ue8m0 scale format ) { constexpr int VEC_SIZE = 4; const int64_t token_idx = blockIdx.x; @@ -552,7 +466,11 @@ __global__ void indexer_k_quant_and_cache_kernel( #ifndef USE_ROCM __syncwarp(); #endif +#if defined(__gfx942__) + float scale = fmaxf(amax, 1e-4) / 224.0f; +#else float scale = fmaxf(amax, 1e-4) / 448.0f; +#endif if (use_ue8m0) { scale = exp2f(ceilf(log2f(scale))); } @@ -901,87 +819,80 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, namespace vllm { // grid is launched with dimensions (batch, num_splits) -template +template __global__ void gather_and_maybe_dequant_cache( - const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, - // ENTRIES...] - scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] - const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] - const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] - const int32_t block_size, const int32_t entry_size, + const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRIES...] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK] + const int32_t num_tokens, const int32_t block_size, const int64_t block_table_stride, const int64_t cache_block_stride, const int64_t cache_entry_stride, const int64_t dst_entry_stride, const float* __restrict__ scale, const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per // batch + constexpr int vec_size = sizeof(float4) / sizeof(scalar_t); + using ltype = vllm::vec_n_t; + using stype = vllm::vec_n_t; + // We are adding this for code readability which will be optimized out when + // build in release. + assert(CTA_SIZE == blockDim.x); - const int64_t bid = blockIdx.x; // Batch ID - const int32_t num_splits = gridDim.y; - const int32_t split = blockIdx.y; - const int32_t seq_start = cu_seq_lens[bid]; - const int32_t seq_end = cu_seq_lens[bid + 1]; - const int32_t seq_len = seq_end - seq_start; - const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size); - const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits); +#pragma unroll + for (int token_id = blockIdx.x; token_id < num_tokens; + token_id += gridDim.x) { + int64_t batch_id = token_to_seq[token_id]; + int64_t batch_start = cu_seq_lens[batch_id]; + int64_t batch_end = cu_seq_lens[batch_id + 1]; + int32_t batch_offset = token_id - batch_start; - const int32_t split_start = split * split_blocks; - const int32_t split_end = min((split + 1) * split_blocks, tot_blocks); + if (token_id >= batch_end) return; + int32_t offset = 0; + if (seq_starts != nullptr) { + offset = seq_starts[batch_id]; + } + batch_offset += offset; + int32_t block_table_id = batch_offset / block_size; + int32_t slot_id = batch_offset % block_size; + int32_t block_table_offset = batch_id * block_table_stride + block_table_id; + int32_t block_id = block_table[block_table_offset]; + int64_t cache_offset = + block_id * cache_block_stride + slot_id * cache_entry_stride; + constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size; + scalar_t* dst_ = dst + token_id * dst_entry_stride; + cache_t* src_ = const_cast(src_cache) + cache_offset; - const bool is_active_split = (split_start < tot_blocks); - const bool is_last_split = (split_end == tot_blocks); - - if (!is_active_split) return; - - int32_t full_blocks_end = split_end; - int32_t partial_block_size = 0; - - // Adjust the pointer for the block_table for this batch. - // If seq_starts is provided, compute an offset based on (seq_starts[bid] / - // page_size) - const int32_t batch_offset = bid * block_table_stride; - int32_t offset = 0; - if (seq_starts != nullptr) { - offset = seq_starts[bid] / block_size; - } - const int32_t* batch_block_table = block_table + batch_offset + offset; - - // Adjust dst pointer based on the cumulative sequence lengths. - dst += seq_start * dst_entry_stride; - - if (is_last_split) { - partial_block_size = seq_len % block_size; - if (partial_block_size) full_blocks_end -= 1; - } - - auto copy_entry = [&](const cache_t* __restrict__ _src, - scalar_t* __restrict__ _dst) { - for (int i = threadIdx.x; i < entry_size; i += blockDim.x) { +#pragma unroll + for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) { if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - _dst[i] = static_cast(_src[i]); + reinterpret_cast(dst_)[idx] = + static_cast(reinterpret_cast(src_)[idx]); } else { - _dst[i] = - fp8::scaled_convert(_src[i], *scale); + ltype loaded_val = reinterpret_cast(src_)[idx]; + stype store_val; +#pragma unroll + for (int j = 0; j < vec_size; ++j) { + store_val.val[j] = fp8::scaled_convert( + loaded_val.val[j], *scale); + } + reinterpret_cast(dst_)[idx] = store_val; } } - }; - - for (int pid = split_start; pid < full_blocks_end; ++pid) { - auto block_id = batch_block_table[pid]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; - for (int eid = 0; eid < block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); - } - } - - if (partial_block_size) { - auto block_id = batch_block_table[full_blocks_end]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; - for (int eid = 0; eid < partial_block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); + // process tail + constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size; + dst_ = dst_ + ENTRY_SIZE - tail_cnt; + src_ = src_ + ENTRY_SIZE - tail_cnt; +#pragma unroll + for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst_[idx] = static_cast(src_[idx]); + } else { + dst_[idx] = + fp8::scaled_convert(src_[idx], *scale); + } } } } @@ -992,34 +903,38 @@ __global__ void gather_and_maybe_dequant_cache( // SCALAR_T is the data type of the destination tensor. // CACHE_T is the stored data type of kv-cache. // KV_DTYPE is the real data type of kv-cache. -#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ - vllm::gather_and_maybe_dequant_cache \ - <<>>( \ - reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst.data_ptr()), \ - block_table.data_ptr(), cu_seq_lens.data_ptr(), \ - block_size, entry_size, block_table_stride, cache_block_stride, \ - cache_entry_stride, dst_entry_stride, \ - reinterpret_cast(scale.data_ptr()), seq_starts_ptr); +#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ + vllm::gather_and_maybe_dequant_cache \ + <<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + token_to_seq.data_ptr(), num_tokens, block_size, \ + block_table_stride, cache_block_stride, cache_entry_stride, \ + dst_entry_stride, reinterpret_cast(scale.data_ptr()), \ + seq_starts_ptr); // Gather sequences from the cache into the destination tensor. // - cu_seq_lens contains the cumulative sequence lengths for each batch // - block_table contains the cache block indices for each sequence +// - token_to_seq contains the back mapping from token_id to batch_id // - Optionally, seq_starts (if provided) offsets the starting block index by // (seq_starts[bid] / page_size) void gather_and_maybe_dequant_cache( - torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] - torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] - torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] - torch::Tensor const& cu_seq_lens, // [BATCH+1] - int64_t batch_size, const std::string& kv_cache_dtype, + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS] + int64_t num_tokens, const std::string& kv_cache_dtype, torch::Tensor const& scale, std::optional seq_starts = std::nullopt) { at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); int32_t block_size = src_cache.size(1); - int32_t entry_size = src_cache.flatten(2, -1).size(2); + int32_t head_dim = dst.size(-1); TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must be int32"); @@ -1029,6 +944,9 @@ void gather_and_maybe_dequant_cache( TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, "seq_starts must be int32"); } + TORCH_CHECK(head_dim == 576, + "gather_and_maybe_dequant_cache only support the head_dim to 576 " + "for better performance") TORCH_CHECK(src_cache.device() == dst.device(), "src_cache and dst must be on the same device"); @@ -1046,10 +964,9 @@ void gather_and_maybe_dequant_cache( int64_t cache_entry_stride = src_cache.stride(1); int64_t dst_entry_stride = dst.stride(0); - // Decide on the number of splits based on the batch size. - int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; - dim3 grid(batch_size, num_splits); - dim3 block(1024); + constexpr int32_t thread_block_size = 64; + dim3 grid(num_tokens); + dim3 block(thread_block_size); const int32_t* seq_starts_ptr = seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr; @@ -1058,6 +975,82 @@ void gather_and_maybe_dequant_cache( } namespace vllm { + +// Gather and upconvert FP8 KV cache tokens to BF16 workspace +// Similar to cp_gather_cache but specifically for FP8->BF16 conversion +__global__ void cp_gather_and_upconvert_fp8_kv_cache( + const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] + __nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ seq_lens, // [BATCH] + const int32_t* __restrict__ workspace_starts, // [BATCH] + const int32_t block_size, const int32_t head_dim, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride) { + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = workspace_starts[bid]; + const int32_t seq_len = seq_lens[bid]; + const int32_t tot_slots = seq_len; + const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits); + + const int32_t split_start = split * split_slots; + const int32_t split_end = min((split + 1) * split_slots, tot_slots); + + const bool is_active_split = (split_start < tot_slots); + + if (!is_active_split) return; + + // Adjust the pointer for the block_table for this batch + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = split_start; + int32_t offset_div = offset / block_size; + offset = offset % block_size; + const int32_t* batch_block_table = block_table + batch_offset; + + // Adjust dst pointer based on the cumulative sequence lengths + dst += seq_start * dst_entry_stride; + + const int tid = threadIdx.x; + + // Process each token in this split + for (int pid = split_start; pid < split_end; ++pid) { + auto block_id = batch_block_table[offset_div]; + const uint8_t* token_ptr = + src_cache + block_id * cache_block_stride + offset * cache_entry_stride; + __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride; + + // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16) + const uint8_t* no_pe_ptr = token_ptr; + const float* scales_ptr = reinterpret_cast(token_ptr + 512); + const __nv_bfloat16* rope_ptr = + reinterpret_cast(token_ptr + 512 + 16); + + // Parallelize fp8 dequant (512 elements) and rope copy (64 elements) + if (tid < 512) { + // FP8 dequantization + const int tile = tid >> 7; // each tile is 128 elements + const float scale = scales_ptr[tile]; + const uint8_t val = no_pe_ptr[tid]; + dst_ptr[tid] = + fp8::scaled_convert<__nv_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale); + } else if (tid < 576) { + // Rope copy (64 bf16 elements) + const int rope_idx = tid - 512; + dst_ptr[512 + rope_idx] = rope_ptr[rope_idx]; + } + + // Move to next token + offset += 1; + if (offset == block_size) { + offset_div += 1; + offset = 0; + } + } +} + template // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by // block_size. @@ -1199,6 +1192,57 @@ void cp_gather_cache( } } +void cp_gather_and_upconvert_fp8_kv_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] + torch::Tensor const& dst, // [TOT_TOKENS, 576] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& seq_lens, // [BATCH] + torch::Tensor const& workspace_starts, // [BATCH] + int64_t batch_size) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t head_dim = dst.size(1); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32"); + TORCH_CHECK(workspace_starts.dtype() == torch::kInt32, + "workspace_starts must be int32"); + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == seq_lens.device(), + "src_cache and seq_lens must be on the same device"); + TORCH_CHECK(src_cache.device() == workspace_starts.device(), + "src_cache and workspace_starts must be on the same device"); + + TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8"); + TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16"); + TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA"); + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(576); + + vllm::cp_gather_and_upconvert_fp8_kv_cache<<>>( + src_cache.data_ptr(), + reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()), + block_table.data_ptr(), seq_lens.data_ptr(), + workspace_starts.data_ptr(), block_size, head_dim, + block_table_stride, cache_block_stride, cache_entry_stride, + dst_entry_stride); +} + // Macro to dispatch the kernel based on the data type. #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ vllm::indexer_k_quant_and_cache_kernel \ diff --git a/csrc/cpu/cpu_attn_macros.h b/csrc/cpu/cpu_arch_macros.h similarity index 51% rename from csrc/cpu/cpu_attn_macros.h rename to csrc/cpu/cpu_arch_macros.h index 6458e43419370..c73b62ecdec90 100644 --- a/csrc/cpu/cpu_attn_macros.h +++ b/csrc/cpu/cpu_arch_macros.h @@ -1,5 +1,5 @@ -#ifndef CPU_ATTN_MACROS_H -#define CPU_ATTN_MACROS_H +#ifndef CPU_ARCH_MACROS_H +#define CPU_ARCH_MACROS_H // x86_64 #ifdef __x86_64__ @@ -26,7 +26,7 @@ _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); \ const __m512i vec_127 = _mm512_set1_epi32(0x0000007f); \ const int n_mantissa_bits = 23; \ - auto fast_exp = [&](vec_op::FP32Vec16& vec) __attribute__(( \ + auto fast_exp = [&](const vec_op::FP32Vec16& vec) __attribute__(( \ always_inline)) { \ __m512 values = vec.reg; \ auto less_ln_flt_min_mask = \ @@ -60,4 +60,54 @@ #endif -#endif \ No newline at end of file +#ifdef __aarch64__ + // Implementation copied from Arm Optimized Routines (expf AdvSIMD) + // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c + #include + #define DEFINE_FAST_EXP \ + const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f); \ + const float ln2_hi = 0x1.62e4p-1f; \ + const float ln2_lo = 0x1.7f7d1cp-20f; \ + const float c0 = 0x1.0e4020p-7f; \ + const float c2 = 0x1.555e66p-3f; \ + const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2}; \ + const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000); \ + const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f); \ + const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f); \ + const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f); \ + const float32x4_t pos_special_bound = vdupq_n_f32(0x1.5d5e2ap+6f); \ + const float32x4_t neg_special_bound = vnegq_f32(pos_special_bound); \ + const float32x4_t inf = \ + vdupq_n_f32(std::numeric_limits::infinity()); \ + const float32x4_t zero = vdupq_n_f32(0.0f); \ + auto neon_expf = [&](float32x4_t values) __attribute__((always_inline)) { \ + float32x4_t n = vrndaq_f32(vmulq_f32(values, inv_ln2)); \ + float32x4_t r = vfmsq_laneq_f32(values, n, ln2_c02, 0); \ + r = vfmsq_laneq_f32(r, n, ln2_c02, 1); \ + uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(n)), 23); \ + float32x4_t scale = vreinterpretq_f32_u32(vaddq_u32(e, exponent_bias)); \ + float32x4_t r2 = vmulq_f32(r, r); \ + float32x4_t p = vfmaq_laneq_f32(c1, r, ln2_c02, 2); \ + float32x4_t q = vfmaq_laneq_f32(c3, r, ln2_c02, 3); \ + q = vfmaq_f32(q, p, r2); \ + p = vmulq_f32(c4, r); \ + float32x4_t poly = vfmaq_f32(p, q, r2); \ + poly = vfmaq_f32(scale, poly, scale); \ + const uint32x4_t hi_mask = vcgeq_f32(values, pos_special_bound); \ + const uint32x4_t lo_mask = vcleq_f32(values, neg_special_bound); \ + poly = vbslq_f32(hi_mask, inf, poly); \ + return vbslq_f32(lo_mask, zero, poly); \ + }; \ + auto fast_exp = [&](const vec_op::FP32Vec16& vec) \ + __attribute__((always_inline)) { \ + float32x4x4_t result; \ + result.val[0] = neon_expf(vec.reg.val[0]); \ + result.val[1] = neon_expf(vec.reg.val[1]); \ + result.val[2] = neon_expf(vec.reg.val[2]); \ + result.val[3] = neon_expf(vec.reg.val[3]); \ + return vec_op::FP32Vec16(result); \ + }; + +#endif // __aarch64__ + +#endif diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index 50f17c758c148..02c722ba031a4 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -13,6 +13,18 @@ #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX: #endif +#ifdef __aarch64__ + #include "cpu_attn_neon.hpp" + #define NEON_DISPATCH(...) \ + case cpu_attention::ISA::NEON: { \ + using attn_impl = cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } +#else + #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON: +#endif // #ifdef __aarch64__ + #define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \ case HEAD_DIM: { \ constexpr size_t head_dim = HEAD_DIM; \ @@ -41,6 +53,7 @@ [&] { \ switch (ISA_TYPE) { \ AMX_DISPATCH(__VA_ARGS__) \ + NEON_DISPATCH(__VA_ARGS__) \ case cpu_attention::ISA::VEC: { \ using attn_impl = \ cpu_attention::AttentionImpl #endif -#include "cpu_types.hpp" -#include "scratchpad_manager.h" -#include "cpu_attn_macros.h" -#include "utils.hpp" +#include "cpu/cpu_arch_macros.h" +#include "cpu/utils.hpp" namespace cpu_attention { -enum class ISA { AMX, VEC, VEC16 }; +enum class ISA { AMX, VEC, VEC16, NEON }; template class AttentionImpl {}; @@ -143,6 +141,12 @@ struct AttentionMetadata { case ISA::VEC: ss << "VEC, "; break; + case ISA::VEC16: + ss << "VEC16, "; + break; + case ISA::NEON: + ss << "NEON, "; + break; } ss << "workitem_group_num: " << workitem_group_num << ", reduction_item_num: " << reduction_item_num @@ -180,7 +184,7 @@ struct AttentionMetadata { // - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2 // * q_tile_size * 4, partial output, max + sum (float) // Reduction scratchpad contains: -// - flags: bool array to indicate wether the split is finished +// - flags: bool array to indicate whether the split is finished // - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size // - max, sum: 2 * split_num * q_tile_size * 4 class AttentionScratchPad { @@ -372,12 +376,13 @@ class AttentionScheduler { static constexpr int32_t MaxQTileIterNum = 128; - AttentionScheduler() : available_cache_size_(get_available_l2_size()) {} + AttentionScheduler() + : available_cache_size_(cpu_utils::get_available_l2_size()) {} torch::Tensor schedule(const ScheduleInput& input) const { const bool casual = input.casual; const int32_t thread_num = omp_get_max_threads(); - const int64_t cache_size = get_available_l2_size(); + const int64_t cache_size = cpu_utils::get_available_l2_size(); const int32_t max_num_q_per_iter = input.max_num_q_per_iter; const int32_t kv_len_alignment = input.kv_block_alignment; int32_t q_head_per_kv = input.num_heads_q / input.num_heads_kv; @@ -653,7 +658,7 @@ class AttentionScheduler { metadata_ptr->thread_num + metadata_ptr->reduction_scratchpad_size_per_kv_head * (use_gqa ? input.num_heads_kv : input.num_heads_q); - DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc( + cpu_utils::ScratchPadManager::get_scratchpad_manager()->realloc( scratchpad_size); // metadata_ptr->print(); @@ -661,7 +666,7 @@ class AttentionScheduler { // test out of boundary access // { // float* cache_ptr = - // DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data(); + // cpu_utils::ScratchPadManager::getl_scratchpad_manager()->get_data(); // for (int64_t i = 0; i < scratchpad_size / sizeof(float); ++i) { // cache_ptr[i] = std::numeric_limits::quiet_NaN(); // } @@ -743,27 +748,6 @@ class AttentionScheduler { return std::max(rounded_tile_size, round_size); } - static int64_t get_available_l2_size() { - static int64_t size = []() { -#if defined(__APPLE__) - // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. - int64_t l2_cache_size = 0; - size_t len = sizeof(l2_cache_size); - if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && - l2_cache_size > 0) { - return l2_cache_size >> 1; // use 50% of L2 cache - } - // Fallback if sysctlbyname fails - return 128LL * 1024 >> 1; // use 50% of 128KB -#else - long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); - TORCH_CHECK_NE(l2_cache_size, -1); - return l2_cache_size >> 1; // use 50% of L2 cache -#endif - }(); - return size; - } - private: int64_t available_cache_size_; }; @@ -841,7 +825,7 @@ struct VecTypeTrait { }; #endif -#if !defined(__powerpc__) +#if !defined(__powerpc__) && !defined(__s390x__) template <> struct VecTypeTrait { using vec_t = vec_op::FP16Vec16; @@ -1240,14 +1224,8 @@ class AttentionMainLoop { // rescale sum and partial outputs if (need_rescale) { // compute rescale factor -#ifdef DEFINE_FAST_EXP - vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); - rescale_factor_vec = fast_exp(rescale_factor_vec); - rescale_factor = rescale_factor_vec.get_last_elem(); -#else rescale_factor = std::exp(rescale_factor); vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); -#endif // rescale sum new_sum_val += rescale_factor * init_sum_val; @@ -1402,7 +1380,7 @@ class AttentionMainLoop { // init buffers void* scratchpad_ptr = - DNNLScratchPadManager::get_dnnl_scratchpad_manager() + cpu_utils::ScratchPadManager::get_scratchpad_manager() ->get_data(); AttentionScratchPad buffer_manager(thread_id, metadata, scratchpad_ptr); @@ -1422,8 +1400,7 @@ class AttentionMainLoop { } } - const int64_t available_cache_size = - AttentionScheduler::get_available_l2_size(); + const int64_t available_cache_size = cpu_utils::get_available_l2_size(); const int32_t default_tile_size = AttentionScheduler::calcu_default_tile_size( available_cache_size, head_dim, sizeof(kv_cache_t), @@ -1883,15 +1860,8 @@ class AttentionMainLoop { : curr_output_buffer; float rescale_factor = final_max > curr_max ? curr_max - final_max : final_max - curr_max; - -#ifdef DEFINE_FAST_EXP - vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); - rescale_factor_vec = fast_exp(rescale_factor_vec); - rescale_factor = rescale_factor_vec.get_last_elem(); -#else rescale_factor = std::exp(rescale_factor); vec_op::FP32Vec16 rescale_factor_vec(rescale_factor); -#endif local_sum[head_idx] = final_max > curr_max ? final_sum + rescale_factor * curr_sum diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp new file mode 100644 index 0000000000000..827f0cfbc718e --- /dev/null +++ b/csrc/cpu/cpu_attn_neon.hpp @@ -0,0 +1,386 @@ +#ifndef CPU_ATTN_NEON_HPP +#define CPU_ATTN_NEON_HPP + +#include "cpu_attn_impl.hpp" +#include +#include +namespace cpu_attention { + +namespace { + +#define BLOCK_SIZE_ALIGNMENT 32 +#define HEAD_SIZE_ALIGNMENT 32 +#define MAX_Q_HEAD_NUM_PER_ITER 16 + +// These do not use vectorized class for loading / converting +// because csrc/cpu/cpu_types_arm.hpp does not have fallback options +// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that +// doesn't support BF16. +// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency. +template +FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0, + float32x4_t& b1); + +template <> +FORCE_INLINE void load_row8_B_as_f32(const float* p, float32x4_t& b0, + float32x4_t& b1) { + b0 = vld1q_f32(p + 0); + b1 = vld1q_f32(p + 4); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::Half* p, + float32x4_t& b0, + float32x4_t& b1) { + const float16_t* h = reinterpret_cast(p); + float16x8_t v = vld1q_f16(h); + b0 = vcvt_f32_f16(vget_low_f16(v)); + b1 = vcvt_f32_f16(vget_high_f16(v)); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p, + float32x4_t& b0, + float32x4_t& b1) { + const uint16_t* u = reinterpret_cast(p); +#ifdef ARM_BF16_SUPPORT + uint16x8_t u0 = vld1q_u16(u); + bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0); + b0 = vcvtq_low_f32_bf16(bf0); + b1 = vcvtq_high_f32_bf16(bf0); +#else + uint16x8_t x0 = vld1q_u16(u); + uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16); + uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16); + b0 = vreinterpretq_f32_u32(lo); + b1 = vreinterpretq_f32_u32(hi); +#endif +} + +// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs +// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2) +// #FMLAs = (K // 4) * (4 * 2 * M) +// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads +template +FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4( + const float* __restrict A, // [M x K], + const kv_cache_t* __restrict B, // [K x 8], + float* __restrict C, // [M x 8], + int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) { + // kernel supports max M of 8, as it'd spill for larger M + static_assert(1 <= M && M <= 8, "M must be in [1,8]"); + +// helpers for per-M codegen +#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7) +#define IF_M(i) if constexpr (M > (i)) + + // A row base pointers +#define DECL_A(i) const float* a##i = A + (i) * lda; + ROWS_APPLY(DECL_A) +#undef DECL_A + + // declare 2 accumulators per row of M +#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1; + ROWS_APPLY(DECL_ACC) +#undef DECL_ACC + + // initialize accumulators +#define INIT_ACC(i) \ + IF_M(i) { \ + if (accumulate) { \ + acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \ + acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \ + } else { \ + acc##i##_0 = vdupq_n_f32(0.f); \ + acc##i##_1 = vdupq_n_f32(0.f); \ + } \ + } + ROWS_APPLY(INIT_ACC) +#undef INIT_ACC + + int32_t k = 0; + + // K unrolled by 4 + for (; k + 3 < K; k += 4) { + // load A[k..k+3] for each active row (M) +#define LOAD_A4(i) \ + float32x4_t a##i##v; \ + IF_M(i) a##i##v = vld1q_f32(a##i + k); + ROWS_APPLY(LOAD_A4) +#undef LOAD_A4 + + // helper: FMA lane L from aiv +#define FMAS_LANE(i, aiv, L) \ + IF_M(i) { \ + acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \ + acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \ + } + + // k + 0 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1); +#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0) + ROWS_APPLY(STEP_K0) +#undef STEP_K0 + } + // k + 1 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1); +#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1) + ROWS_APPLY(STEP_K1) +#undef STEP_K1 + } + // k + 2 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1); +#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2) + ROWS_APPLY(STEP_K2) +#undef STEP_K2 + } + // k + 3 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1); +#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3) + ROWS_APPLY(STEP_K3) +#undef STEP_K3 + } +#undef FMAS_LANE + } + + // K tail + for (; k < K; ++k) { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1); +#define TAIL_ROW(i) \ + IF_M(i) { \ + float32x4_t ai = vdupq_n_f32(*(a##i + k)); \ + acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \ + acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \ + } + ROWS_APPLY(TAIL_ROW) +#undef TAIL_ROW + } + + // store accumulators to C +#define STORE_ROW(i) \ + IF_M(i) { \ + vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \ + vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \ + } + ROWS_APPLY(STORE_ROW) +#undef STORE_ROW + +#undef ROWS_APPLY +#undef IF_M +} + +template +FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A, + const kv_cache_t* __restrict B, + float* __restrict C, int32_t M, + int32_t K, int64_t lda, + int64_t ldb, int64_t ldc, + bool accumulate) { + // micro kernel is Mx8 + static_assert(N % 8 == 0, "N must be a multiple of 8"); + for (int32_t m = 0; m < M;) { + int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1; + const float* Ab = A + m * lda; + float* Cb = C + m * ldc; + + for (int32_t n = 0; n < N; n += 8) { + const kv_cache_t* Bn = B + n; + float* Cn = Cb + n; + switch (mb) { + case 8: + gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + case 4: + gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + case 2: + gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + default: + gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + } + } + // no tail loop for N as it's guaranteed to be a multiple of 8 + m += mb; + } +} + +template +class TileGemmNeonFMLA { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + float* __restrict__ a_tile, + kv_cache_t* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + if constexpr (phase == AttentionGemmPhase::QK) { + gemm_macro_neon_fmla_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c); + } else { + gemm_macro_neon_fmla_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc, + accum_c); + } + } +}; + +} // namespace + +// this is similar to "ISA::VEC" at the moment +template +class AttentionImpl { + public: + using query_t = scalar_t; + using q_buffer_t = float; + using kv_cache_t = scalar_t; + using logits_buffer_t = float; + using partial_output_buffer_t = float; + using prob_buffer_t = float; + + constexpr static int64_t BlockSizeAlignment = + BLOCK_SIZE_ALIGNMENT; // KV token num unit of QK and PV phases + constexpr static int64_t HeadDimAlignment = + HEAD_SIZE_ALIGNMENT; // headdim num unit of PV phase + constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER; + constexpr static int64_t HeadDim = head_dim; + constexpr static ISA ISAType = ISA::NEON; + constexpr static bool scale_on_logits = false; // apply scale on q_buffer + + static_assert(HeadDim % HeadDimAlignment == 0); + // the gemm micro kernel is Mx8 + static_assert(HeadDimAlignment % 8 == 0); + static_assert(BlockSizeAlignment % 8 == 0); + + public: + template